From b7b6b3e56e8c3e2f4d3780eda2b23e095e654c23 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Mon, 11 Jul 2022 09:01:40 +0530 Subject: [PATCH 01/12] Add files via upload Have added Exercises and solutions for the language processing Pipeline(Video-9) tutorial. --- .../Language_Processing_exercise.ipynb | 208 +++++++++++++++++ ...nguage_Processing_exercise_solutions.ipynb | 218 ++++++++++++++++++ 2 files changed, 426 insertions(+) create mode 100644 5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb create mode 100644 5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb diff --git a/5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb b/5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb new file mode 100644 index 0000000..46fa81c --- /dev/null +++ b/5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Yrci22GYhTQP" + }, + "source": [ + "### **Spacy Language Processing Pipelines: Exercises**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YUMPkcohhgam" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0i2XdEcDhggU" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hxtliEGIh4gS" + }, + "source": [ + "#### **Excersie: 1**\n", + "\n", + "- Get all the proper nouns from a given text in a list and also count how many of them.\n", + "- **Proper Noun** means a noun that names a particular person, place, or thing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lRGfbeEshFf-", + "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" + }, + "outputs": [], + "source": [ + "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", + "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", + "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", + "'''\n", + "\n", + "# https://spacy.io/usage/linguistic-features\n", + "\n", + "#creating the nlp object\n", + "doc = nlp(text) \n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WfU6CRIWhFh8" + }, + "source": [ + "**Expected Output**\n", + "\n", + "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", + "\n", + "Count: 8\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bcne4zNDmZ0C" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FUr2rnbYmdlv" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Get all companies names from a given text and also the count of them.\n", + "- **Hint**: Use the spacy **ner** functionality " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLf4xyGEmZ2P", + "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" + }, + "outputs": [], + "source": [ + "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", + "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and 
Bharti Airtel'''\n", + "\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4JK5eMsCmZ5i" + }, + "source": [ + "**Expected Output**\n", + "\n", + "\n", + "Company Names: [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", + "\n", + "Count: 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tn36z3pvhFlM" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HkbNaNVChFoB" + }, + "source": [ + "## [**Solution**](./Language_Processing_exercise_solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Language Processing_exercise.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb b/5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb new file mode 100644 index 0000000..bccc86a --- /dev/null +++ b/5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Yrci22GYhTQP" + }, + "source": [ + "### **Spacy Language Processing Pipelines: Solutions**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AZ97K0-xhFcy" + }, + "outputs": 
[], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "YUMPkcohhgam" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0i2XdEcDhggU" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hxtliEGIh4gS" + }, + "source": [ + "#### **Exercise: 1**\n", + "\n", + "- Get all the proper nouns from a given text in a list and also count how many there are.\n", + "- **Proper Noun** means a noun that names a particular person, place, or thing." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lRGfbeEshFf-", + "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", + "Count: 8\n" + ] + } + ], + "source": [ + "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", + "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", + "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", + "'''\n", + "\n", + "# https://spacy.io/usage/linguistic-features\n", + "\n", + "#creating the doc object\n", + "doc = nlp(text) \n", + "\n", + "\n", + "#list for storing the proper nouns\n", + "all_proper_nouns = [] \n", + "\n", + "\n", + "for token in doc:\n", + " if token.pos_ == \"PROPN\": #checking whether the token's part of speech is \"PROPN\" [Proper Noun]\n", + " 
all_proper_nouns.append(token)\n", + " \n", + "\n", + "#finally printing the results\n", + "print(\"Proper Nouns: \", all_proper_nouns)\n", + "print(\"Count: \", len(all_proper_nouns))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WfU6CRIWhFh8" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bcne4zNDmZ0C" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FUr2rnbYmdlv" + }, + "source": [ + "#### **Exercise: 2**\n", + "\n", + "- Get all the company names from a given text and also count them.\n", + "- **Hint**: Use the spacy **ner** functionality " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLf4xyGEmZ2P", + "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Company Names: [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", + "Count: 10\n" + ] + } + ], + "source": [ + "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", + "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''\n", + "\n", + "\n", + "doc = nlp(text)\n", + "\n", + "#list for storing the company names\n", + "all_company_names = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'ORG': #checking whether the entity is labelled \"ORG\" [Organisation]\n", + " all_company_names.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"Company Names: \", all_company_names)\n", + "print(\"Count: \", len(all_company_names))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4JK5eMsCmZ5i" + }, + "outputs": [], + "source": [] + }, + { 
+ "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "HkbNaNVChFoB" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Language Processing_exercise.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 29f2c3236db66c6299359e65500c5f466d252789 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Mon, 11 Jul 2022 09:03:34 +0530 Subject: [PATCH 02/12] Add files via upload Have added the exercises and solutions for the Stemming and Lemmatization (Video-10) Tutorial. --- .../Stemming_and_Lemmatization_Exercise.ipynb | 262 +++++++++++++++ ...Stemming_and_Lemmatization_Solutions.ipynb | 315 ++++++++++++++++++ 2 files changed, 577 insertions(+) create mode 100644 6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb create mode 100644 6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb diff --git a/6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb b/6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb new file mode 100644 index 0000000..7767378 --- /dev/null +++ b/6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-eWVL6QPtJqx" + }, + "source": [ + "### **Stemming and Lemmatization: Exercises**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q-myRSrdtGsn" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **Run this cell to import all necessary 
packages**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fn0J_mWHtM57" + }, + "outputs": [], + "source": [ + "#let's import the necessary libraries and create the objects\n", + "\n", + "#for nltk\n", + "import nltk\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "#downloading all necessary packages related to nltk\n", + "nltk.download('all')\n", + "\n", + "\n", + "#for spacy\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8cmEWYRztM8E" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xCjMD77juhOm" + }, + "source": [ + "**Exercise1:** \n", + "- Convert this list of words into their base forms using Stemming and Lemmatization and observe the transformations\n", + "- Write a short note on the words that have different base words using stemming and Lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ytgw50jtM_h", + "outputId": "47b1cf60-7eb6-449c-d136-01289a08abf0" + }, + "outputs": [], + "source": [ + "#using stemming in nltk\n", + "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uU7oPj0ftNCP", + "outputId": "8bf39249-3afb-46a2-9e4e-9fc75be31cc4" + }, + "outputs": [], + "source": [ + "#using lemmatization in spacy\n", + "\n", + "doc = nlp(\"running painting walking dressing likely children who good ate fishing\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": 
"FWyzJcFStGvf" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8E9iNZvQtGxk" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLjsQGjVye5W" + }, + "source": [ + "**Exercise2:**\n", + "\n", + "- Convert the given text into its base form using both stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0weT7a9tygnk" + }, + "outputs": [], + "source": [ + "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a \n", + "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "id": "JP29y7cjygqa", + "outputId": "6cdc08a3-d313-495a-c0a6-34366848b27e" + }, + "outputs": [], + "source": [ + "#using stemming in nltk\n", + "\n", + "\n", + "#step1: Word tokenizing\n", + "\n", + "\n", + "\n", + "\n", + "#step2: getting the base form for each token using stemmer\n", + "\n", + "\n", + "\n", + "\n", + "#step3: joining all the words in the list into a single string using 'join()'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hysIhJuxyg0B", + "outputId": "da5f4be7-48ce-44de-a783-866571f796b9" + }, + "outputs": [], + "source": [ + "#using lemmatisation in spacy\n", + "\n", + "\n", + "#step1: Creating the object for the given text\n", + "\n", + "\n", + "\n", + "#step2: getting the base form for each token using spacy 'lemma_'\n", + "\n", + "\n", + "\n", + "\n", + "#step3: joining all the words in the list into a single string using 'join()'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": 
"j0PsCauBtG00" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [**Solution**](./Stemming_and_Lemmatization_Solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Stemming and Lemmatization.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb b/6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb new file mode 100644 index 0000000..45e5947 --- /dev/null +++ b/6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-eWVL6QPtJqx" + }, + "source": [ + "### **Stemming and Lemmatization: Solutions**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q-myRSrdtGsn" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **Run this cell to import all necessary packages**" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "fn0J_mWHtM57" + }, + "outputs": [], + "source": [ + "#let import necessary libraries and create the object\n", + "\n", + "#for nltk\n", + "import nltk\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "#downloading all neccessary packages related to nltk\n", + "nltk.download('all')\n", + "\n", + "\n", + 
"#for spacy\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "id": "8cmEWYRztM8E" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xCjMD77juhOm" + }, + "source": [ + "**Exercise1:** \n", + "- Convert these list of words into base form using Stemming and Lemmatization and observe the transformations\n", + "- Write a short note on the words that have different base words using stemming and Lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ytgw50jtM_h", + "outputId": "47b1cf60-7eb6-449c-d136-01289a08abf0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running | run\n", + "painting | paint\n", + "walking | walk\n", + "dressing | dress\n", + "likely | like\n", + "children | children\n", + "whom | whom\n", + "good | good\n", + "ate | ate\n", + "fishing | fish\n" + ] + } + ], + "source": [ + "#using stemming in nltk\n", + "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", + "\n", + "for word in lst_words:\n", + " print(f\"{word} | {stemmer.stem(word)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uU7oPj0ftNCP", + "outputId": "8bf39249-3afb-46a2-9e4e-9fc75be31cc4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running | run\n", + "painting | painting\n", + "walking | walk\n", + "dressing | dress\n", + "likely | likely\n", + "children | child\n", + "who | who\n", + "good | good\n", + "ate | eat\n", + "fishing | fishing\n" + ] + } + ], + "source": [ + "#using lemmatization in spacy\n", + "\n", + "doc = nlp(\"running painting walking dressing likely children who good ate 
fishing\")\n", + "for token in doc:\n", + " print(token, \" | \", token.lemma_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FWyzJcFStGvf" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8E9iNZvQtGxk" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLjsQGjVye5W" + }, + "source": [ + "**Exercise2:**\n", + "\n", + "- Convert the given text into its base form using both stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "0weT7a9tygnk" + }, + "outputs": [], + "source": [ + "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a \n", + "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "id": "JP29y7cjygqa", + "outputId": "6cdc08a3-d313-495a-c0a6-34366848b27e" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . 
she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#using stemming in nltk\n", + "\n", + "#step1: Word tokenizing\n", + "all_word_tokens = nltk.word_tokenize(text)\n", + "\n", + "\n", + "#step2: getting the base form for each token using stemmer\n", + "all_base_words = []\n", + "\n", + "for token in all_word_tokens:\n", + " base_form = stemmer.stem(token)\n", + " all_base_words.append(base_form)\n", + "\n", + "\n", + "#step3: joining all words in a list into string using 'join()'\n", + "final_base_text = ' '.join(all_base_words)\n", + "print(final_base_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hysIhJuxyg0B", + "outputId": "da5f4be7-48ce-44de-a783-866571f796b9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Latha be very multi talented girl . she be good at many skill like dancing , run , singing , playing . she also like eat Pav Bhagi . she have a \n", + " habit of fishing and swim too . besides all this , she be a wonderful at cooking too . 
\n", + "\n" + ] + } + ], + "source": [ + "#using lemmatisation in spacy\n", + "\n", + "\n", + "#step1: Creating the object for the given text\n", + "doc = nlp(text)\n", + "all_base_words = []\n", + "\n", + "#step2: getting the base form for each token using spacy 'lemma_'\n", + "for token in doc:\n", + " base_word = token.lemma_\n", + " all_base_words.append(base_word)\n", + "\n", + "\n", + "#step3: joining all words in a list into string using 'join()'\n", + "final_base_text = ' '.join(all_base_words)\n", + "print(final_base_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j0PsCauBtG00" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Stemming and Lemmatization.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 8b9b01b6064f7c264849c176441e4e0aa10e2578 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Mon, 11 Jul 2022 09:04:35 +0530 Subject: [PATCH 03/12] Add files via upload have added exercises and solutions for NER (Video-12) Tutorial. 
--- 8_NER/Named_Entity_Recognition_Exercise.ipynb | 205 ++++++++++++++++ .../Named_Entity_Recognition_Solutions.ipynb | 223 ++++++++++++++++++ 2 files changed, 428 insertions(+) create mode 100644 8_NER/Named_Entity_Recognition_Exercise.ipynb create mode 100644 8_NER/Named_Entity_Recognition_Solutions.ipynb diff --git a/8_NER/Named_Entity_Recognition_Exercise.ipynb b/8_NER/Named_Entity_Recognition_Exercise.ipynb new file mode 100644 index 0000000..45a26a8 --- /dev/null +++ b/8_NER/Named_Entity_Recognition_Exercise.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ibF7pjNyhBrR" + }, + "source": [ + "### **Named Entity Recognition (NER): Exercises**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zmQPX8Rsg_uU" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "liI13P5Eg_w-" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FgUhnqY2hKWN" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vNfOBYsWhKX5" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFR-Ep2DhXXa" + }, + "source": [ + "#### **Exercise: 1**\n", + "\n", + "- Extract all the geographical names (cities, countries, states) from a given text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HgYXmU3phKbg", + "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" + }, + "outputs": [], + "source": [ + "text = \"\"\"Kiran want to know the famous foods in each state of India. 
So, he opened Google and search for this question. Google showed that\n", + "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", + "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", + "\n", + "doc = nlp(text)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "\n", + "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", + "\n", + "Count: 7" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "itW9bgUXhjDW" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "St3zoLXOhjLi" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fJT4UnUYkpSP" + }, + "source": [ + "#### **Exercise: 2**\n", + "\n", + "- Extract all the birth dates of cricketers in the given text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pOMngJ2chKda", + "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" + }, + "outputs": [], + "source": [ + "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981\n", + "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "\n", + "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", + "\n", + "Count: 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "v2Bj1o9thKgy" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "t_OAhVlahKiv" + }, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Solution](./Named_Entity_Recognition_Solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Named Entity Recognition (NER).ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/8_NER/Named_Entity_Recognition_Solutions.ipynb b/8_NER/Named_Entity_Recognition_Solutions.ipynb new file mode 100644 index 0000000..090afa8 --- /dev/null +++ b/8_NER/Named_Entity_Recognition_Solutions.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ibF7pjNyhBrR" + }, + "source": [ + "### **Named Entity Recognition (NER): Solutions**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zmQPX8Rsg_uU" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "liI13P5Eg_w-" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FgUhnqY2hKWN" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vNfOBYsWhKX5" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFR-Ep2DhXXa" + }, + "source": [ + "#### **Exercise: 1**\n", + "\n", + "- Extract all the Geographical (cities, 
countries, states) names from a given text" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HgYXmU3phKbg", + "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", + "Count: 7\n" + ] + } + ], + "source": [ + "text = \"\"\"Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that\n", + "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", + "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "#list for storing all the names\n", + "all_gpe_names = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'GPE': #checking whether the entity is labelled \"GPE\" [Geopolitical entity: countries, cities, states]\n", + " all_gpe_names.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"Geographical location Names: \", all_gpe_names)\n", + "print(\"Count: \", len(all_gpe_names))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "itW9bgUXhjDW" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "St3zoLXOhjLi" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fJT4UnUYkpSP" + }, + "source": [ + "#### **Exercise: 2**\n", + "\n", + "- Extract all the birth dates of cricketers in the given text" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pOMngJ2chKda", + "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" + 
}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", + "Count: 4\n" + ] + } + ], + "source": [ + "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981\n", + "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "#list for storing all the dates\n", + "all_birth_dates = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'DATE': #checking the whether token belongs to entity \"DATE\" [Dates]\n", + " all_birth_dates.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"All Birth Dates: \", all_birth_dates)\n", + "print(\"Count: \", len(all_birth_dates))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "v2Bj1o9thKgy" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "t_OAhVlahKiv" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Named Entity Recognition (NER).ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 6909db4b22ff9d2939a36c42bc98c943978c366f Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:42:04 +0530 Subject: [PATCH 04/12] Delete Language_Processing_exercise.ipynb --- .../Language_Processing_exercise.ipynb | 208 ------------------ 1 file changed, 208 deletions(-) delete mode 
100644 5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb diff --git a/5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb b/5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb deleted file mode 100644 index 46fa81c..0000000 --- a/5_spacy_lang_processing_pipeline/Language_Processing_exercise.ipynb +++ /dev/null @@ -1,208 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Yrci22GYhTQP" - }, - "source": [ - "### **Spacy Language Processing Pipelines: Exercises**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YUMPkcohhgam" - }, - "outputs": [], - "source": [ - "#importing necessary libraries \n", - "import spacy\n", - "\n", - "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0i2XdEcDhggU" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hxtliEGIh4gS" - }, - "source": [ - "#### **Excersie: 1**\n", - "\n", - "- Get all the proper nouns from a given text in a list and also count how many of them.\n", - "- **Proper Noun** means a noun that names a particular person, place, or thing." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lRGfbeEshFf-", - "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" - }, - "outputs": [], - "source": [ - "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", - "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", - "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", - "'''\n", - "\n", - "# https://spacy.io/usage/linguistic-features\n", - "\n", - "#creating the nlp object\n", - "doc = nlp(text) \n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WfU6CRIWhFh8" - }, - "source": [ - "**Expected Output**\n", - "\n", - "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", - "\n", - "Count: 8\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bcne4zNDmZ0C" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FUr2rnbYmdlv" - }, - "source": [ - "#### **Excersie: 2**\n", - "\n", - "- Get all companies names from a given text and also the count of them.\n", - "- **Hint**: Use the spacy **ner** functionality " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LLf4xyGEmZ2P", - "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" - }, - "outputs": [], - "source": [ - "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", - "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and 
Bharti Airtel'''\n", - "\n", - "\n", - "doc = nlp(text)\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4JK5eMsCmZ5i" - }, - "source": [ - "**Expected Output**\n", - "\n", - "\n", - "Company Names: [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", - "\n", - "Count: 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tn36z3pvhFlM" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HkbNaNVChFoB" - }, - "source": [ - "## [**Solution**](./Language_Processing_exercise_solutions.ipynb)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Language Processing_exercise.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From 1000c1df3b7077e36001ca224417825bfb9669c3 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:42:17 +0530 Subject: [PATCH 05/12] Delete Language_Processing_exercise_solutions.ipynb --- ...nguage_Processing_exercise_solutions.ipynb | 218 ------------------ 1 file changed, 218 deletions(-) delete mode 100644 5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb diff --git a/5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb b/5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb deleted file mode 100644 index 
bccc86a..0000000 --- a/5_spacy_lang_processing_pipeline/Language_Processing_exercise_solutions.ipynb +++ /dev/null @@ -1,218 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Yrci22GYhTQP" - }, - "source": [ - "### **Spacy Language Processing Pipelines: Solutions**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AZ97K0-xhFcy" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "YUMPkcohhgam" - }, - "outputs": [], - "source": [ - "#importing necessary libraries \n", - "import spacy\n", - "\n", - "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0i2XdEcDhggU" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hxtliEGIh4gS" - }, - "source": [ - "#### **Excersie: 1**\n", - "\n", - "- Get all the proper nouns from a given text in a list and also count how many of them.\n", - "- **Proper Noun** means a noun that names a particular person, place, or thing." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lRGfbeEshFf-", - "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", - "Count: 8\n" - ] - } - ], - "source": [ - "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", - "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", - "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", - "'''\n", - "\n", - "# https://spacy.io/usage/linguistic-features\n", - "\n", - "#creating the nlp object\n", - "doc = nlp(text) \n", - "\n", - "\n", - "#list for storing the proper nouns\n", - "all_proper_nouns = [] \n", - "\n", - "\n", - "for token in doc:\n", - " if token.pos_ == \"PROPN\": #checking the whether token belongs to parts of speech \"PROPN\" [Proper Noun]\n", - " all_proper_nouns.append(token)\n", - " \n", - "\n", - "#finally printing the results\n", - "print(\"Proper Nouns: \", all_proper_nouns)\n", - "print(\"Count: \", len(all_proper_nouns))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WfU6CRIWhFh8" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bcne4zNDmZ0C" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FUr2rnbYmdlv" - }, - "source": [ - "#### **Excersie: 2**\n", - "\n", - "- Get all companies names from a given text and also the count of them.\n", - "- **Hint**: Use the spacy **ner** functionality " - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": 
{ - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LLf4xyGEmZ2P", - "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Company Names: [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", - "Count: 10\n" - ] - } - ], - "source": [ - "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", - "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''\n", - "\n", - "\n", - "doc = nlp(text)\n", - "\n", - "#list for storing the company names\n", - "all_company_names = []\n", - "\n", - "for ent in doc.ents:\n", - " if ent.label_ == 'ORG': #checking the whether token belongs to entity \"ORG\" [Organisation]\n", - " all_company_names.append(ent)\n", - "\n", - "\n", - "\n", - "#finally printing the results\n", - "print(\"Company Names: \", all_company_names)\n", - "print(\"Count: \", len(all_company_names))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4JK5eMsCmZ5i" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "id": "HkbNaNVChFoB" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Language Processing_exercise.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From 9cfee4f83787c4e8fe8fc731c9fa15cfe0d39e0a Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 
19:42:35 +0530 Subject: [PATCH 06/12] Delete Stemming_and_Lemmatization_Exercise.ipynb --- .../Stemming_and_Lemmatization_Exercise.ipynb | 262 ------------------ 1 file changed, 262 deletions(-) delete mode 100644 6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb diff --git a/6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb b/6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb deleted file mode 100644 index 7767378..0000000 --- a/6_stemming_lematization/Stemming_and_Lemmatization_Exercise.ipynb +++ /dev/null @@ -1,262 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "-eWVL6QPtJqx" - }, - "source": [ - "### **Stemming and Lemmatization: Exercises**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Q-myRSrdtGsn" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- **Run this cell to import all necessary packages**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fn0J_mWHtM57" - }, - "outputs": [], - "source": [ - "#let import necessary libraries and create the object\n", - "\n", - "#for nltk\n", - "import nltk\n", - "from nltk.stem import PorterStemmer\n", - "stemmer = PorterStemmer()\n", - "\n", - "#downloading all neccessary packages related to nltk\n", - "nltk.download('all')\n", - "\n", - "\n", - "#for spacy\n", - "import spacy\n", - "nlp = spacy.load(\"en_core_web_sm\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8cmEWYRztM8E" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xCjMD77juhOm" - }, - "source": [ - "**Exercise1:** \n", - "- Convert these list of words into base form using Stemming and Lemmatization and observe the transformations\n", - "- 
Write a short note on the words that have different base words using stemming and Lemmatization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Ytgw50jtM_h", - "outputId": "47b1cf60-7eb6-449c-d136-01289a08abf0" - }, - "outputs": [], - "source": [ - "#using stemming in nltk\n", - "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "uU7oPj0ftNCP", - "outputId": "8bf39249-3afb-46a2-9e4e-9fc75be31cc4" - }, - "outputs": [], - "source": [ - "#using lemmatization in spacy\n", - "\n", - "doc = nlp(\"running painting walking dressing likely children who good ate fishing\")\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWyzJcFStGvf" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8E9iNZvQtGxk" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CLjsQGjVye5W" - }, - "source": [ - "**Exercise2:**\n", - "\n", - "- convert the given text into it's base form using both stemming and lemmatization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0weT7a9tygnk" - }, - "outputs": [], - "source": [ - "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. 
she has a \n", - "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 53 - }, - "id": "JP29y7cjygqa", - "outputId": "6cdc08a3-d313-495a-c0a6-34366848b27e" - }, - "outputs": [], - "source": [ - "#using stemming in nltk\n", - "\n", - "\n", - "#step1: Word tokenizing\n", - "\n", - "\n", - "\n", - "\n", - "#step2: getting the base form for each token using stemmer\n", - "\n", - "\n", - "\n", - "\n", - "#step3: joining all words in a list into string using 'join()'\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hysIhJuxyg0B", - "outputId": "da5f4be7-48ce-44de-a783-866571f796b9" - }, - "outputs": [], - "source": [ - "#using lemmatisation in spacy\n", - "\n", - "\n", - "#step1: Creating the object for the given text\n", - "\n", - "\n", - "\n", - "#step2: getting the base form for each token using spacy 'lemma_'\n", - "\n", - "\n", - "\n", - "\n", - "#step3: joining all words in a list into string using 'join()'\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "j0PsCauBtG00" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## [**Solution**](./Stemming_and_Lemmatization_Solutions.ipynb)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Stemming and Lemmatization.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - 
"name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From ce7d55470f4e9715afc72fc7405013b87007bb12 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:42:52 +0530 Subject: [PATCH 07/12] Delete Stemming_and_Lemmatization_Solutions.ipynb --- ...Stemming_and_Lemmatization_Solutions.ipynb | 315 ------------------ 1 file changed, 315 deletions(-) delete mode 100644 6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb diff --git a/6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb b/6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb deleted file mode 100644 index 45e5947..0000000 --- a/6_stemming_lematization/Stemming_and_Lemmatization_Solutions.ipynb +++ /dev/null @@ -1,315 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "-eWVL6QPtJqx" - }, - "source": [ - "### **Stemming and Lemmatization: Solutions**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Q-myRSrdtGsn" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- **Run this cell to import all necessary packages**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "fn0J_mWHtM57" - }, - "outputs": [], - "source": [ - "#let import necessary libraries and create the object\n", - "\n", - "#for nltk\n", - "import nltk\n", - "from nltk.stem import PorterStemmer\n", - "stemmer = PorterStemmer()\n", - "\n", - "#downloading all neccessary packages related to nltk\n", - "nltk.download('all')\n", - "\n", - "\n", - "#for spacy\n", - "import spacy\n", - "nlp = spacy.load(\"en_core_web_sm\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "id": "8cmEWYRztM8E" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xCjMD77juhOm" - 
}, - "source": [ - "**Exercise1:** \n", - "- Convert these list of words into base form using Stemming and Lemmatization and observe the transformations\n", - "- Write a short note on the words that have different base words using stemming and Lemmatization" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Ytgw50jtM_h", - "outputId": "47b1cf60-7eb6-449c-d136-01289a08abf0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "running | run\n", - "painting | paint\n", - "walking | walk\n", - "dressing | dress\n", - "likely | like\n", - "children | children\n", - "whom | whom\n", - "good | good\n", - "ate | ate\n", - "fishing | fish\n" - ] - } - ], - "source": [ - "#using stemming in nltk\n", - "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", - "\n", - "for word in lst_words:\n", - " print(f\"{word} | {stemmer.stem(word)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "uU7oPj0ftNCP", - "outputId": "8bf39249-3afb-46a2-9e4e-9fc75be31cc4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "running | run\n", - "painting | painting\n", - "walking | walk\n", - "dressing | dress\n", - "likely | likely\n", - "children | child\n", - "who | who\n", - "good | good\n", - "ate | eat\n", - "fishing | fishing\n" - ] - } - ], - "source": [ - "#using lemmatization in spacy\n", - "\n", - "doc = nlp(\"running painting walking dressing likely children who good ate fishing\")\n", - "for token in doc:\n", - " print(token, \" | \", token.lemma_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FWyzJcFStGvf" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"8E9iNZvQtGxk" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CLjsQGjVye5W" - }, - "source": [ - "**Exercise2:**\n", - "\n", - "- convert the given text into it's base form using both stemming and lemmatization" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "id": "0weT7a9tygnk" - }, - "outputs": [], - "source": [ - "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a \n", - "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 53 - }, - "id": "JP29y7cjygqa", - "outputId": "6cdc08a3-d313-495a-c0a6-34366848b27e" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . 
she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .'" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#using stemming in nltk\n", - "\n", - "#step1: Word tokenizing\n", - "all_word_tokens = nltk.word_tokenize(text)\n", - "\n", - "\n", - "#step2: getting the base form for each token using stemmer\n", - "all_base_words = []\n", - "\n", - "for token in all_word_tokens:\n", - " base_form = stemmer.stem(token)\n", - " all_base_words.append(base_form)\n", - "\n", - "\n", - "#step3: joining all words in a list into string using 'join()'\n", - "final_base_text = ' '.join(all_base_words)\n", - "print(final_base_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hysIhJuxyg0B", - "outputId": "da5f4be7-48ce-44de-a783-866571f796b9" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Latha be very multi talented girl . she be good at many skill like dancing , run , singing , playing . she also like eat Pav Bhagi . she have a \n", - " habit of fishing and swim too . besides all this , she be a wonderful at cooking too . 
\n", - "\n" - ] - } - ], - "source": [ - "#using lemmatisation in spacy\n", - "\n", - "\n", - "#step1: Creating the object for the given text\n", - "doc = nlp(text)\n", - "all_base_words = []\n", - "\n", - "#step2: getting the base form for each token using spacy 'lemma_'\n", - "for token in doc:\n", - " base_word = token.lemma_\n", - " all_base_words.append(base_word)\n", - "\n", - "\n", - "#step3: joining all words in a list into string using 'join()'\n", - "final_base_text = ' '.join(all_base_words)\n", - "print(final_base_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "j0PsCauBtG00" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Stemming and Lemmatization.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From 78abcfd3d00a2961f7e4dbdd41802b5a843c76cf Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:43:17 +0530 Subject: [PATCH 08/12] Delete Named_Entity_Recognition_Exercise.ipynb --- 8_NER/Named_Entity_Recognition_Exercise.ipynb | 205 ------------------ 1 file changed, 205 deletions(-) delete mode 100644 8_NER/Named_Entity_Recognition_Exercise.ipynb diff --git a/8_NER/Named_Entity_Recognition_Exercise.ipynb b/8_NER/Named_Entity_Recognition_Exercise.ipynb deleted file mode 100644 index 45a26a8..0000000 --- a/8_NER/Named_Entity_Recognition_Exercise.ipynb +++ /dev/null @@ -1,205 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - 
"metadata": { - "id": "ibF7pjNyhBrR" - }, - "source": [ - "### **Named Entity Recognition (NER): Exercises**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zmQPX8Rsg_uU" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "liI13P5Eg_w-" - }, - "outputs": [], - "source": [ - "#importing necessary libraries \n", - "import spacy\n", - "\n", - "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FgUhnqY2hKWN" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vNfOBYsWhKX5" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LFR-Ep2DhXXa" - }, - "source": [ - "#### **Excersie: 1**\n", - "\n", - "- Extract all the Geographical (cities, Countries, states) names from a given text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HgYXmU3phKbg", - "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" - }, - "outputs": [], - "source": [ - "text = \"\"\"Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. 
Google showed that\n", - "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", - "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", - "\n", - "doc = nlp(text)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Expected Output:**\n", - "\n", - "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", - "\n", - "Count: 7" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "itW9bgUXhjDW" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "St3zoLXOhjLi" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fJT4UnUYkpSP" - }, - "source": [ - "#### **Excersie: 2**\n", - "\n", - "- Extract all the birth dates of cricketers in the given Text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pOMngJ2chKda", - "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" - }, - "outputs": [], - "source": [ - "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981\n", - "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", - "\n", - "doc = nlp(text)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Expected Output:**\n", - "\n", - "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", - "\n", - "Count: 4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "v2Bj1o9thKgy" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t_OAhVlahKiv" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "## [Solution](./Named_Entity_Recognition_Solutions.ipynb)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Named Entity Recognition (NER).ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} From 086e24a0c30b7b519ba98189943d46e6c610e648 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:43:27 +0530 Subject: [PATCH 09/12] Delete Named_Entity_Recognition_Solutions.ipynb --- .../Named_Entity_Recognition_Solutions.ipynb | 223 ------------------ 1 file changed, 223 deletions(-) delete mode 100644 8_NER/Named_Entity_Recognition_Solutions.ipynb diff --git a/8_NER/Named_Entity_Recognition_Solutions.ipynb b/8_NER/Named_Entity_Recognition_Solutions.ipynb deleted file mode 100644 index 090afa8..0000000 --- a/8_NER/Named_Entity_Recognition_Solutions.ipynb +++ /dev/null @@ -1,223 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "ibF7pjNyhBrR" - }, - "source": [ - "### **Named Entity Recognition (NER): Solutions**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zmQPX8Rsg_uU" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "liI13P5Eg_w-" - }, - "outputs": [], - "source": [ - "#importing necessary libraries \n", - "import spacy\n", - "\n", - "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FgUhnqY2hKWN" - }, - "outputs": [], - 
"source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vNfOBYsWhKX5" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LFR-Ep2DhXXa" - }, - "source": [ - "#### **Excersie: 1**\n", - "\n", - "- Extract all the Geographical (cities, Countries, states) names from a given text" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HgYXmU3phKbg", - "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", - "Count: 7\n" - ] - } - ], - "source": [ - "text = \"\"\"Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that\n", - "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", - "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", - "\n", - "doc = nlp(text)\n", - "\n", - "\n", - "\n", - "#list for storing all the names\n", - "all_gpe_names = []\n", - "\n", - "for ent in doc.ents:\n", - " if ent.label_ == 'GPE': #checking the whether token belongs to entity \"GPE\" [Geographical location]\n", - " all_gpe_names.append(ent)\n", - "\n", - "\n", - "\n", - "#finally printing the results\n", - "print(\"Geographical location Names: \", all_gpe_names)\n", - "print(\"Count: \", len(all_gpe_names))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "itW9bgUXhjDW" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "St3zoLXOhjLi" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fJT4UnUYkpSP" - }, - 
"source": [ - "#### **Excersie: 2**\n", - "\n", - "- Extract all the birth dates of cricketers in the given Text" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pOMngJ2chKda", - "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", - "Count: 4\n" - ] - } - ], - "source": [ - "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981\n", - "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", - "\n", - "doc = nlp(text)\n", - "\n", - "\n", - "\n", - "#list for storing all the dates\n", - "all_birth_dates = []\n", - "\n", - "for ent in doc.ents:\n", - " if ent.label_ == 'DATE': #checking the whether token belongs to entity \"DATE\" [Dates]\n", - " all_birth_dates.append(ent)\n", - "\n", - "\n", - "\n", - "#finally printing the results\n", - "print(\"All Birth Dates: \", all_birth_dates)\n", - "print(\"Count: \", len(all_birth_dates))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "v2Bj1o9thKgy" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t_OAhVlahKiv" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Named Entity Recognition (NER).ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - 
"nbformat_minor": 1 -} From b5a2cc16b943a45031d36498d74048af970dc938 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:49:02 +0530 Subject: [PATCH 10/12] Add files via upload --- .../language_processing_exercise.ipynb | 167 ++++++++++++++++++ ...nguage_processing_exercise_solutions.ipynb | 164 +++++++++++++++++ 2 files changed, 331 insertions(+) create mode 100644 5_spacy_lang_processing_pipeline/language_processing_exercise.ipynb create mode 100644 5_spacy_lang_processing_pipeline/language_processing_exercise_solutions.ipynb diff --git a/5_spacy_lang_processing_pipeline/language_processing_exercise.ipynb b/5_spacy_lang_processing_pipeline/language_processing_exercise.ipynb new file mode 100644 index 0000000..c289a1c --- /dev/null +++ b/5_spacy_lang_processing_pipeline/language_processing_exercise.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Yrci22GYhTQP" + }, + "source": [ + "### **Spacy Language Processing Pipelines: Exercises**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YUMPkcohhgam" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hxtliEGIh4gS" + }, + "source": [ + "#### **Excersie: 1**\n", + "\n", + "- Get all the proper nouns from a given text in a list and also count how many of them.\n", + "- **Proper Noun** means a noun that names a particular person, place, or thing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lRGfbeEshFf-", + "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" + }, + "outputs": [], + "source": [ + "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", + "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", + "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", + "'''\n", + "\n", + "# https://spacy.io/usage/linguistic-features\n", + "\n", + "#creating the nlp object\n", + "doc = nlp(text) \n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WfU6CRIWhFh8" + }, + "source": [ + "**Expected Output**\n", + "\n", + "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", + "\n", + "Count: 8\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FUr2rnbYmdlv" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Get all companies names from a given text and also the count of them.\n", + "- **Hint**: Use the spacy **ner** functionality " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLf4xyGEmZ2P", + "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" + }, + "outputs": [], + "source": [ + "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", + "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''\n", + "\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4JK5eMsCmZ5i" + }, + "source": [ + "**Expected 
Output**\n", + "\n", + "\n", + "Company Names: [Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", + "\n", + "Count: 10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HkbNaNVChFoB" + }, + "source": [ + "## [**Solution**](./language_processing_exercise_solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Language Processing_exercise.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/5_spacy_lang_processing_pipeline/language_processing_exercise_solutions.ipynb b/5_spacy_lang_processing_pipeline/language_processing_exercise_solutions.ipynb new file mode 100644 index 0000000..07ccc84 --- /dev/null +++ b/5_spacy_lang_processing_pipeline/language_processing_exercise_solutions.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Yrci22GYhTQP" + }, + "source": [ + "### **Spacy Language Processing Pipelines: Solutions**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "YUMPkcohhgam" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hxtliEGIh4gS" + }, + "source": [ + "#### **Excersie: 1**\n", + "\n", + "- Get all the proper nouns from a given text in a list and also count how many of them.\n", + "- **Proper Noun** means a noun that names a particular 
person, place, or thing." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lRGfbeEshFf-", + "outputId": "f8d6beed-c03a-479c-b7bd-4a21173aba55" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]\n", + "Count: 8\n" + ] + } + ], + "source": [ + "text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and \n", + "visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.\n", + "They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!\n", + "'''\n", + "\n", + "# https://spacy.io/usage/linguistic-features\n", + "\n", + "#creating the nlp object\n", + "doc = nlp(text) \n", + "\n", + "\n", + "#list for storing the proper nouns\n", + "all_proper_nouns = [] \n", + "\n", + "\n", + "for token in doc:\n", + " if token.pos_ == \"PROPN\": #checking the whether token belongs to parts of speech \"PROPN\" [Proper Noun]\n", + " all_proper_nouns.append(token)\n", + " \n", + "\n", + "#finally printing the results\n", + "print(\"Proper Nouns: \", all_proper_nouns)\n", + "print(\"Count: \", len(all_proper_nouns))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FUr2rnbYmdlv" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Get all companies names from a given text and also the count of them.\n", + "- **Hint**: Use the spacy **ner** functionality " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLf4xyGEmZ2P", + "outputId": "e9582d9f-4f1e-4574-e3d8-a5526a4fb6cb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Company Names: 
[Tesla, Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever, Bharti Airtel]\n", + "Count: 10\n" + ] + } + ], + "source": [ + "text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in \n", + "India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''\n", + "\n", + "\n", + "doc = nlp(text)\n", + "\n", + "#list for storing the company names\n", + "all_company_names = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'ORG': #checking the whether token belongs to entity \"ORG\" [Organisation]\n", + " all_company_names.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"Company Names: \", all_company_names)\n", + "print(\"Count: \", len(all_company_names))" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Language Processing_exercise.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 8a466a971e342033bbee2e5829f477aced24a969 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:49:40 +0530 Subject: [PATCH 11/12] Add files via upload --- .../stemming_and_lemmatization_exercise.ipynb | 203 +++++++++++++ ...stemming_and_lemmatization_solutions.ipynb | 278 ++++++++++++++++++ 2 files changed, 481 insertions(+) create mode 100644 6_stemming_lematization/stemming_and_lemmatization_exercise.ipynb create mode 100644 6_stemming_lematization/stemming_and_lemmatization_solutions.ipynb diff --git a/6_stemming_lematization/stemming_and_lemmatization_exercise.ipynb 
b/6_stemming_lematization/stemming_and_lemmatization_exercise.ipynb new file mode 100644 index 0000000..a549423 --- /dev/null +++ b/6_stemming_lematization/stemming_and_lemmatization_exercise.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-eWVL6QPtJqx" + }, + "source": [ + "### **Stemming and Lemmatization: Exercises**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **Run this cell to import all necessary packages**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fn0J_mWHtM57" + }, + "outputs": [], + "source": [ + "#let's import necessary libraries and create the object\n", + "\n", + "#for nltk\n", + "import nltk\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "#downloading all necessary packages related to nltk\n", + "nltk.download('all')\n", + "\n", + "\n", + "#for spacy\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xCjMD77juhOm" + }, + "source": [ + "**Exercise1:** \n", + "- Convert this list of words into base form using Stemming and Lemmatization and observe the transformations\n", + "- Write a short note on the words whose base forms differ between stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ytgw50jtM_h", + "outputId": "47b1cf60-7eb6-449c-d136-01289a08abf0" + }, + "outputs": [], + "source": [ + "#using stemming in nltk\n", + "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uU7oPj0ftNCP", + "outputId": "8bf39249-3afb-46a2-9e4e-9fc75be31cc4" + }, + "outputs": [], +
"source": [ + "#using lemmatization in spacy\n", + "\n", + "doc = nlp(\"running painting walking dressing likely children whom good ate fishing\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLjsQGjVye5W" + }, + "source": [ + "**Exercise2:**\n", + "\n", + "- Convert the given text into its base form using both stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0weT7a9tygnk" + }, + "outputs": [], + "source": [ + "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a \n", + "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "id": "JP29y7cjygqa", + "outputId": "6cdc08a3-d313-495a-c0a6-34366848b27e" + }, + "outputs": [], + "source": [ + "#using stemming in nltk\n", + "\n", + "\n", + "#step1: Word tokenizing\n", + "\n", + "\n", + "\n", + "\n", + "#step2: getting the base form for each token using stemmer\n", + "\n", + "\n", + "\n", + "\n", + "#step3: joining all words in a list into string using 'join()'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hysIhJuxyg0B", + "outputId": "da5f4be7-48ce-44de-a783-866571f796b9" + }, + "outputs": [], + "source": [ + "#using lemmatisation in spacy\n", + "\n", + "\n", + "#step1: Creating the object for the given text\n", + "\n", + "\n", + "\n", + "#step2: getting the base form for each token using spacy 'lemma_'\n", + "\n", + "\n", + "\n", + "\n", + "#step3: joining all words in a list into string using 'join()'\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##
[**Solution**](./stemming_and_lemmatization_solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Stemming and Lemmatization.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/6_stemming_lematization/stemming_and_lemmatization_solutions.ipynb b/6_stemming_lematization/stemming_and_lemmatization_solutions.ipynb new file mode 100644 index 0000000..80a6be8 --- /dev/null +++ b/6_stemming_lematization/stemming_and_lemmatization_solutions.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-eWVL6QPtJqx" + }, + "source": [ + "### **Stemming and Lemmatization: Solutions**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qWR0ho4HGnWL" + }, + "source": [ + "- **Run this cell to import all necessary packages**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fn0J_mWHtM57" + }, + "outputs": [], + "source": [ + "#let import necessary libraries and create the object\n", + "\n", + "#for nltk\n", + "import nltk\n", + "from nltk.stem import PorterStemmer\n", + "stemmer = PorterStemmer()\n", + "\n", + "#downloading all neccessary packages related to nltk\n", + "nltk.download('all')\n", + "\n", + "\n", + "#for spacy\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xCjMD77juhOm" + }, + "source": [ + "**Exercise1:** \n", + "- Convert these list of words into base form using Stemming and Lemmatization and observe the transformations\n", + "- Write a short note on the 
words that have different base words using stemming and Lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ytgw50jtM_h", + "outputId": "ad4ffe2a-698e-47f4-cb9d-c9b0d6501dcb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running | run\n", + "painting | paint\n", + "walking | walk\n", + "dressing | dress\n", + "likely | like\n", + "children | children\n", + "whom | whom\n", + "good | good\n", + "ate | ate\n", + "fishing | fish\n" + ] + } + ], + "source": [ + "#using stemming in nltk\n", + "lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']\n", + "\n", + "for word in lst_words:\n", + " print(f\"{word} | {stemmer.stem(word)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uU7oPj0ftNCP", + "outputId": "a6af545c-b1ca-462b-dd28-9d8b6b713185" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running | run\n", + "painting | painting\n", + "walking | walk\n", + "dressing | dress\n", + "likely | likely\n", + "children | child\n", + "whom | whom\n", + "good | good\n", + "ate | eat\n", + "fishing | fishing\n" + ] + } + ], + "source": [ + "#using lemmatization in spacy\n", + "\n", + "doc = nlp(\"running painting walking dressing likely children whom good ate fishing\")\n", + "for token in doc:\n", + " print(token, \" | \", token.lemma_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zIM2fumjGyxe" + }, + "source": [ + "**Observations**\n", + "\n", + "- Words whose base forms differ between stemming and lemmatization are:\n", + " - painting\n", + " - likely\n", + " - children\n", + " - ate\n", + " - fishing\n", + "\n", + "- Because stemming reaches the base word by stripping **suffixes** (ing, ly, etc.), it successfully transforms
words like 'painting', 'likely' and 'fishing', whereas lemmatization leaves these three words unchanged here.\n", + "\n", + "- Because lemmatization uses **dictionary** knowledge when converting to the base form, words like 'children' and 'ate' are successfully transformed, whereas stemming leaves them unchanged here." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLjsQGjVye5W" + }, + "source": [ + "**Exercise2:**\n", + "\n", + "- Convert the given text into its base form using both stemming and lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "0weT7a9tygnk" + }, + "outputs": [], + "source": [ + "text = \"\"\"Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a \n", + "habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JP29y7cjygqa", + "outputId": "da12a10f-799b-4320-b575-4e86d21fa43e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi .
she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .\n" + ] + } + ], + "source": [ + "#using stemming in nltk\n", + "\n", + "#step1: Word tokenizing\n", + "all_word_tokens = nltk.word_tokenize(text)\n", + "\n", + "\n", + "#step2: getting the base form for each token using stemmer\n", + "all_base_words = []\n", + "\n", + "for token in all_word_tokens:\n", + " base_form = stemmer.stem(token)\n", + " all_base_words.append(base_form)\n", + "\n", + "\n", + "#step3: joining all words in a list into string using 'join()'\n", + "final_base_text = ' '.join(all_base_words)\n", + "print(final_base_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hysIhJuxyg0B", + "outputId": "c76eeb91-a7b7-4f8e-f3c0-f0f4692580c5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Latha be very multi talented girl . she be good at many skill like dancing , run , singing , playing . she also like eat Pav Bhagi . she have a \n", + " habit of fishing and swim too . besides all this , she be a wonderful at cooking too . 
\n", + "\n" + ] + } + ], + "source": [ + "#using lemmatisation in spacy\n", + "\n", + "\n", + "#step1: Creating the object for the given text\n", + "doc = nlp(text)\n", + "all_base_words = []\n", + "\n", + "#step2: getting the base form for each token using spacy 'lemma_'\n", + "for token in doc:\n", + " base_word = token.lemma_\n", + " all_base_words.append(base_word)\n", + "\n", + "\n", + "#step3: joining all words in a list into string using 'join()'\n", + "final_base_text = ' '.join(all_base_words)\n", + "print(final_base_text)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "stemming_and_lemmatization_solutions.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From ca2ad8228709c34e592029431d9b8b6d12fb25f8 Mon Sep 17 00:00:00 2001 From: Marala Kirandeep Date: Wed, 13 Jul 2022 19:50:16 +0530 Subject: [PATCH 12/12] Add files via upload --- 8_NER/named_entity_recognition_exercise.ipynb | 151 +++++++++++++++++ .../named_entity_recognition_solutions.ipynb | 160 ++++++++++++++++++ 2 files changed, 311 insertions(+) create mode 100644 8_NER/named_entity_recognition_exercise.ipynb create mode 100644 8_NER/named_entity_recognition_solutions.ipynb diff --git a/8_NER/named_entity_recognition_exercise.ipynb b/8_NER/named_entity_recognition_exercise.ipynb new file mode 100644 index 0000000..1ff3860 --- /dev/null +++ b/8_NER/named_entity_recognition_exercise.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ibF7pjNyhBrR" + }, + "source": [ + "### **Named Entity Recognition (NER): Exercises**" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "liI13P5Eg_w-" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFR-Ep2DhXXa" + }, + "source": [ + "#### **Excersie: 1**\n", + "\n", + "- Extract all the Geographical (cities, Countries, states) names from a given text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HgYXmU3phKbg", + "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" + }, + "outputs": [], + "source": [ + "text = \"\"\"Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that\n", + "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", + "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "\n", + "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", + "\n", + "Count: 7" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fJT4UnUYkpSP" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Extract all the birth dates of cricketers in the given Text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pOMngJ2chKda", + "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" + }, + "outputs": [], + "source": [ + "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was 
born on 7 July 1981\n", + "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Expected Output:**\n", + "\n", + "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", + "\n", + "Count: 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Solution](./named_entity_recognition_solutions.ipynb)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Named Entity Recognition (NER).ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/8_NER/named_entity_recognition_solutions.ipynb b/8_NER/named_entity_recognition_solutions.ipynb new file mode 100644 index 0000000..49e1ff9 --- /dev/null +++ b/8_NER/named_entity_recognition_solutions.ipynb @@ -0,0 +1,160 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ibF7pjNyhBrR" + }, + "source": [ + "### **Named Entity Recognition (NER): Solutions**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "liI13P5Eg_w-" + }, + "outputs": [], + "source": [ + "#importing necessary libraries \n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_sm\") #creating an object and loading the pre-trained model for \"English\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFR-Ep2DhXXa" + }, + "source": [ + "#### **Excersie: 1**\n", + "\n", + "- Extract all the Geographical (cities, Countries, states) names from a given text" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HgYXmU3phKbg", + "outputId": "ff6b90e8-ee46-4017-b185-c7b53f65d625" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Geographical location Names: [India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]\n", + "Count: 7\n" + ] + } + ], + "source": [ + "text = \"\"\"Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that\n", + "in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,\n", + "in Bihar it is Litti Chowkha and so on for all other states\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "#list for storing all the names\n", + "all_gpe_names = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'GPE': #checking the whether token belongs to entity \"GPE\" [Geographical location]\n", + " all_gpe_names.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"Geographical location Names: \", all_gpe_names)\n", + "print(\"Count: \", len(all_gpe_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fJT4UnUYkpSP" + }, + "source": [ + "#### **Excersie: 2**\n", + "\n", + "- Extract all the birth dates of cricketers in the given Text" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pOMngJ2chKda", + "outputId": "bbfb7cd5-3088-4503-8d94-aa263aac0608" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All Birth Dates: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]\n", + "Count: 4\n" + ] + } + ], + "source": [ + "text = \"\"\"Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, 
Dhoni was born on 7 July 1981\n", + "and finally Ricky ponting was born on 19 December 1974.\"\"\"\n", + "\n", + "doc = nlp(text)\n", + "\n", + "\n", + "\n", + "#list for storing all the dates\n", + "all_birth_dates = []\n", + "\n", + "for ent in doc.ents:\n", + " if ent.label_ == 'DATE': #checking the whether token belongs to entity \"DATE\" [Dates]\n", + " all_birth_dates.append(ent)\n", + "\n", + "\n", + "\n", + "#finally printing the results\n", + "print(\"All Birth Dates: \", all_birth_dates)\n", + "print(\"Count: \", len(all_birth_dates))" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Named Entity Recognition (NER).ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}