diff --git a/5_spacy_lang_processing_pipeline/language_processing_exercise.ipynb b/5_spacy_lang_processing_pipeline/language_processing_exercise.ipynb new file mode 100644 index 0000000..c289a1c --- /dev/null +++ b/5_spacy_lang_processing_pipeline/language_processing_exercise.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Yrci22GYhTQP" + }, + "source": [ + "### **spaCy Language Processing Pipelines: Exercises**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YUMPkcohhgam" + }, + "outputs": [], + "source": [ + "#importing necessary libraries\n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating the nlp object by loading the pre-trained English pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hxtliEGIh4gS" + }, + "source": [ + "#### **Exercise: 1**\n", + "\n", + "- Extract all the proper nouns from the given text into a list, and count them.\n", + "- A **proper noun** is a noun that names a particular person, place, or thing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lRGfbeEshFf-", + "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" + }, + "outputs": [], + "source": [ + "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", + "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", + "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", + "'''\n", + "\n", + "# https://spacy.io/usage/linguistic-features\n", + "\n", + "#processing the text to create the Doc object\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WfU6CRIWhFh8" + }, + "source": [ + "**Expected Output**\n", + "\n", + "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", + "\n", + "Count: 8\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FUr2rnbYmdlv" + }, + "source": [ + "#### **Exercise: 2**\n", + "\n", + "- Extract all the company names from the given text, and count them.\n", + "- **Hint**: Use spaCy's **NER** functionality" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLf4xyGEmZ2P", + "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" + }, + "outputs": [], + "source": [ + "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", + "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''\n", + "\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4JK5eMsCmZ5i" + }, + "source": [ + "**Expected Output**\n", + "\n", + "\n", + "Company Names: [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", + "\n", + "Count: 10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HkbNaNVChFoB" + }, + "source": [ + "## [**Solution**](./language_processing_exercise_solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Language Processing_exercise.ipynb", +
"provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/5_spacy_lang_processing_pipeline/language_processing_exercise_solutions.ipynb b/5_spacy_lang_processing_pipeline/language_processing_exercise_solutions.ipynb new file mode 100644 index 0000000..07ccc84 --- /dev/null +++ b/5_spacy_lang_processing_pipeline/language_processing_exercise_solutions.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Yrci22GYhTQP" + }, + "source": [ + "### **Spacy Language Processing Pipelines: Solutions**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "YUMPkcohhgam" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hxtliEGIh4gS" + }, + "source": [ + "#### **Excersie: 1**\n", + "\n", + "- Get all the proper nouns from a given text in a list and also count how many of them.\n", + "- **Proper Noun** means a noun that names a particular person, place, or thing." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lRGfbeEshFf-", + "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", + "Count: 8\n" + ] + } + ], + "source": [ + "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", + "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", + "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", + "'''\n", + "\n", + "# https://spacy.io/usage/linguistic-features\n", + "\n", + "#creating the nlp object\n", + "doc = nlp(text) \n", + "\n", + "\n", + "#list for storing the proper nouns\n", + "all_proper_nouns = [] \n", + "\n", + "\n", + "for token in doc:\n", + " if token.pos_ == \"PROPN\": #checking the whether token belongs to parts of speech \"PROPN\" [Proper Noun]\n", + " all_proper_nouns.append(token)\n", + " \n", + "\n", + "#finally printing the results\n", + "print(\"Proper Nouns: \", all_proper_nouns)\n", + "print(\"Count: \", len(all_proper_nouns))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FUr2rnbYmdlv" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Get all companies names from a given text and also the count of them.\n", + "- **Hint**: Use the spacy **ner** functionality " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLf4xyGEmZ2P", + "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Company Names: [Tesla, Walmart, Amazon, 
+ { + "cell_type": "markdown", + "metadata": { + "id": "FUr2rnbYmdlv" + }, + "source": [ + "#### **Exercise: 2**\n", + "\n", + "- Extract all the company names from the given text, and count them.\n", + "- **Hint**: Use spaCy's **NER** functionality" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLf4xyGEmZ2P", + "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Company Names: [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", + "Count: 10\n" + ] + } + ], + "source": [ + "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", + "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''\n", + "\n", + "\n", + "doc = nlp(text)\n", + "\n", + "#list for storing the company names\n", + "all_company_names = []\n", + "\n", + "for ent in doc.ents:\n", + "    if ent.label_ == 'ORG': #checking whether the entity label is 'ORG' (organization)\n", + "        all_company_names.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"Company Names: \", all_company_names)\n", + "print(\"Count: \", len(all_company_names))" + ] + },
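+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "displaCy's entity visualizer highlights the detected entities directly in the text, which makes mislabelled spans easy to spot. A sketch, not part of the original solution (renders inline in Jupyter):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from spacy import displacy\n", + "\n", + "#sketch: render the named entities of the processed text inline\n", + "displacy.render(doc, style=\"ent\", jupyter=True)" + ] + }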
+ ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Language Processing_exercise.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/6_stemming_lematization/stemming_and_lemmatization_exercise.ipynb b/6_stemming_lematization/stemming_and_lemmatization_exercise.ipynb new file mode 100644 index 0000000..a549423 --- /dev/null +++ b/6_stemming_lematization/stemming_and_lemmatization_exercise.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-eWVL6QPtJqx" + }, + "source": [ + "### **Stemming and Lemmatization: Exercises**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **Run this cell to import all necessary packages**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fn0J_mWHtM57" + }, + "outputs": [], + "source": [ + "#importing the necessary libraries and creating the objects\n", + "\n", + "#for nltk\n", + "import nltk\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "#downloading all necessary nltk data packages\n", + "nltk.download('all')\n", + "\n", + "\n", + "#for spacy\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xCjMD77juhOm" + }, + "source": [ + "**Exercise 1:**\n", + "- Convert this list of words into their base forms using stemming and lemmatization, and observe the transformations\n", + "- Write a short note on the words whose base forms differ between stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ytgw50jtM_h", + "outputId": "47b1cf60-7eb6-449c-d136-01289a08abf0" + }, + "outputs": [], + "source": [ + "#using stemming in nltk\n", + "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uU7oPj0ftNCP", + "outputId": "8bf39249-3afb-46a2-9e4e-9fc75be31cc4" + }, + "outputs": [], + "source": [ + "#using lemmatization in spacy\n", + "\n", + "doc = nlp(\"running painting walking dressing likely children whom good ate fishing\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLjsQGjVye5W" + }, + "source": [ + "**Exercise 2:**\n", + "\n", + "- Convert the given text into its base form using both stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0weT7a9tygnk" + }, + "outputs": [], + "source": [ + "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a \n", + "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "id": "JP29y7cjygqa", + "outputId": "6cdc08a3-d313-495a-c0a6-34366848b27e" + }, + "outputs": [], + "source": [ + "#using stemming in nltk\n", + "\n", + "\n", + "#step1: word tokenizing\n", + "\n", + "\n", + "\n", + "\n", + "#step2: getting the base form for each token using the stemmer\n", + "\n", + "\n", + "\n", + "\n", + "#step3: joining the words in the list back into a string using 'join()'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hysIhJuxyg0B", + "outputId": "da5f4be7-48ce-44de-a783-866571f796b9" + }, + "outputs": [], + "source": [ + "#using lemmatization in spacy\n", + "\n", + "\n", + "#step1: creating the Doc object for the given text\n", + "\n", + "\n", + "\n", + "#step2: getting the base form for each token using spacy's 'lemma_'\n", + "\n", + "\n", + "\n", + "\n", + "#step3: joining the words in the list back into a string using 'join()'\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [**Solution**](./stemming_and_lemmatization_solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Stemming and Lemmatization.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/6_stemming_lematization/stemming_and_lemmatization_solutions.ipynb b/6_stemming_lematization/stemming_and_lemmatization_solutions.ipynb new file mode 100644 index 0000000..80a6be8 --- /dev/null +++ b/6_stemming_lematization/stemming_and_lemmatization_solutions.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-eWVL6QPtJqx" + }, + "source": [ + "### **Stemming and Lemmatization: Solutions**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qWR0ho4HGnWL" + }, + "source": [ + "- **Run this cell to import all necessary packages**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fn0J_mWHtM57" + }, + "outputs": [], + "source": [ + "#importing the necessary libraries and creating the objects\n", + "\n", + "#for nltk\n", + "import nltk\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "#downloading all necessary nltk data packages\n", + "nltk.download('all')\n", + "\n", + "\n", + "#for spacy\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + },
spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xCjMD77juhOm" + }, + "source": [ + "**Exercise1:** \n", + "- Convert these list of words into base form using Stemming and Lemmatization and observe the transformations\n", + "- Write a short note on the words that have different base words using stemming and Lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ytgw50jtM_h", + "outputId": "ad4ffe2a-698e-47f4-cb9d-c9b0d6501dcb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running | run\n", + "painting | paint\n", + "walking | walk\n", + "dressing | dress\n", + "likely | like\n", + "children | children\n", + "whom | whom\n", + "good | good\n", + "ate | ate\n", + "fishing | fish\n" + ] + } + ], + "source": [ + "#using stemming in nltk\n", + "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", + "\n", + "for word in lst_words:\n", + " print(f\"{word} | {stemmer.stem(word)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uU7oPj0ftNCP", + "outputId": "a6af545c-b1ca-462b-dd28-9d8b6b713185" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running | run\n", + "painting | painting\n", + "walking | walk\n", + "dressing | dress\n", + "likely | likely\n", + "children | child\n", + "whom | whom\n", + "good | good\n", + "ate | eat\n", + "fishing | fishing\n" + ] + } + ], + "source": [ + "#using lemmatization in spacy\n", + "\n", + "doc = nlp(\"running painting walking dressing likely children whom good ate fishing\")\n", + "for token in doc:\n", + " print(token, \" | \", token.lemma_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zIM2fumjGyxe" + }, + "source": [ + "**Observations**\n", + "\n", + "- Words that are different in stemming and lemmatization are:\n", + " - painting\n", + " - likely\n", + " - children\n", + " - ate\n", + " - fishing\n", + "\n", + "- As Stemming achieves the base word by removing the **suffixes** [ing, ly etc], so it successfully transform the words like 'painting', 'likely', 'fishing' and lemmatization fails for some words ending with suffixes here.\n", + "\n", + "- As Lemmatization uses the **dictionary** meanings while converting to the base form, so words like 'children' and 'ate' are successfully transformed and stemming fails here." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLjsQGjVye5W" + }, + "source": [ + "**Exercise2:**\n", + "\n", + "- convert the given text into it's base form using both stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "0weT7a9tygnk" + }, + "outputs": [], + "source": [ + "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. 
+ { + "cell_type": "markdown", + "metadata": { + "id": "CLjsQGjVye5W" + }, + "source": [ + "**Exercise 2:**\n", + "\n", + "- Convert the given text into its base form using both stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "0weT7a9tygnk" + }, + "outputs": [], + "source": [ + "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a \n", + "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JP29y7cjygqa", + "outputId": "da12a10f-799b-4320-b575-4e86d21fa43e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .\n" + ] + } + ], + "source": [ + "#using stemming in nltk\n", + "\n", + "#step1: word tokenizing\n", + "all_word_tokens = nltk.word_tokenize(text)\n", + "\n", + "\n", + "#step2: getting the base form for each token using the stemmer\n", + "all_base_words = []\n", + "\n", + "for token in all_word_tokens:\n", + "    base_form = stemmer.stem(token)\n", + "    all_base_words.append(base_form)\n", + "\n", + "\n", + "#step3: joining the words in the list back into a string using 'join()'\n", + "final_base_text = ' '.join(all_base_words)\n", + "print(final_base_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hysIhJuxyg0B", + "outputId": "c76eeb91-a7b7-4f8e-f3c0-f0f4692580c5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Latha be very multi talented girl . she be good at many skill like dancing , run , singing , playing . she also like eat Pav Bhagi . she have a \n", + " habit of fishing and swim too . besides all this , she be a wonderful at cooking too . \n", + "\n" + ] + } + ], + "source": [ + "#using lemmatization in spacy\n", + "\n", + "\n", + "#step1: creating the Doc object for the given text\n", + "doc = nlp(text)\n", + "all_base_words = []\n", + "\n", + "#step2: getting the base form for each token using spacy's 'lemma_'\n", + "for token in doc:\n", + "    base_word = token.lemma_\n", + "    all_base_words.append(base_word)\n", + "\n", + "\n", + "#step3: joining the words in the list back into a string using 'join()'\n", + "final_base_text = ' '.join(all_base_words)\n", + "print(final_base_text)" + ] + },
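+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The joined output above keeps punctuation and stray newlines as separate tokens. A small variant (a sketch, reusing the `doc` from the previous cell) filters them out with `token.is_punct` and `token.is_space`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#sketch: drop punctuation and whitespace tokens before joining the lemmas\n", + "clean_base_words = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]\n", + "print(' '.join(clean_base_words))" + ] + }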
"metadata": { + "id": "LFR-Ep2DhXXa" + }, + "source": [ + "#### **Excersie: 1**\n", + "\n", + "- Extract all the Geographical (cities, Countries, states) names from a given text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HgYXmU3phKbg", + "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" + }, + "outputs": [], + "source": [ + "text = \"\"\"Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that\n", + "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", + "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "\n", + "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", + "\n", + "Count: 7" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fJT4UnUYkpSP" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Extract all the birth dates of cricketers in the given Text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pOMngJ2chKda", + "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" + }, + "outputs": [], + "source": [ + "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981\n", + "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "\n", + "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", + "\n", + "Count: 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Solution](./named_entity_recognition_solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Named Entity Recognition (NER).ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/8_NER/named_entity_recognition_solutions.ipynb b/8_NER/named_entity_recognition_solutions.ipynb new file mode 100644 index 0000000..49e1ff9 --- /dev/null +++ b/8_NER/named_entity_recognition_solutions.ipynb @@ -0,0 +1,160 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ibF7pjNyhBrR" + }, + "source": [ + "### **Named Entity Recognition (NER): Solutions**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "liI13P5Eg_w-" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFR-Ep2DhXXa" + }, + "source": [ + "#### **Excersie: 
1**\n", + "\n", + "- Extract all the Geographical (cities, Countries, states) names from a given text" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HgYXmU3phKbg", + "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", + "Count: 7\n" + ] + } + ], + "source": [ + "text = \"\"\"Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that\n", + "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", + "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "#list for storing all the names\n", + "all_gpe_names = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'GPE': #checking the whether token belongs to entity \"GPE\" [Geographical location]\n", + " all_gpe_names.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"Geographical location Names: \", all_gpe_names)\n", + "print(\"Count: \", len(all_gpe_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fJT4UnUYkpSP" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Extract all the birth dates of cricketers in the given Text" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pOMngJ2chKda", + "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", + "Count: 4\n" + ] + } + ], + "source": [ + "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981\n", + "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "#list for storing all the dates\n", + "all_birth_dates = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'DATE': #checking the whether token belongs to entity \"DATE\" [Dates]\n", + " all_birth_dates.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"All Birth Dates: \", all_birth_dates)\n", + "print(\"Count: \", len(all_birth_dates))" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Named Entity Recognition (NER).ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}