From bf1801f7a10bc343ff97c53b64b9f445f6ae7376 Mon Sep 17 00:00:00 2001 From: anass236 Date: Mon, 13 Jan 2020 02:51:32 +0100 Subject: [PATCH 1/2] Created using Colaboratory --- [IA_Niv2]_NLP_TP_MARAH_Anass.ipynb | 2284 ++++++++++++++++++++++++++++ 1 file changed, 2284 insertions(+) create mode 100644 [IA_Niv2]_NLP_TP_MARAH_Anass.ipynb diff --git a/[IA_Niv2]_NLP_TP_MARAH_Anass.ipynb b/[IA_Niv2]_NLP_TP_MARAH_Anass.ipynb new file mode 100644 index 00000000..052398d4 --- /dev/null +++ b/[IA_Niv2]_NLP_TP_MARAH_Anass.ipynb @@ -0,0 +1,2284 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "colab": { + "name": "[IA_Niv2] NLP_TP_MARAH_Anass.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true, + "include_colab_link": true + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B8hAa9wS67FD", + "colab_type": "text" + }, + "source": [ + "#**Introduction**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8HenIWJbpaCj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import numpy as np\n", + "import pandas as pd" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "MraTQd-jpaCq", + "colab_type": "code", + "colab": {} + }, + "source": [ + "spam = pd.read_csv('spam.csv')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "mhcBYtTqpaCt", + "colab_type": "code", + "outputId": "1bde3f20-9e4c-481f-8ab6-61224706f41f", + 
"colab": { + "base_uri": "https://localhost:8080/", + "height": 195 + } + }, + "source": [ + "spam.head()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", + "
" + ], + "text/plain": [ + " Category Message\n", + "0 ham Go until jurong point, crazy.. Available only ...\n", + "1 ham Ok lar... Joking wif u oni...\n", + "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", + "3 ham U dun say so early hor... U c already then say...\n", + "4 ham Nah I don't think he goes to usf, he lives aro..." + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 5 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D93tkBtlp7uh", + "colab_type": "text" + }, + "source": [ + "**Le but** de cette démonstration est d'appliquer les principales fonctionnalités du Natural Language Processing, à savoir la lemmatisation et le stemming. On va travailler sur un DataSet d'e-mails composé de deux colonnes:\n", + "\n", + "\n", + "1. Category: le type de message, soit \"ham\", soit \"spam\"\n", + "2. Message: le texte du message écrit dans l'e-mail\n", + "\n", + "Ensuite, on va essayer de prédire la catégorie de chaque message à l'aide d'algorithmes de classification" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "JuNlg_yXpaCz", + "colab_type": "code", + "outputId": "c72bce71-5933-4f1f-bf68-e118a398e264", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "source": [ + "# Installer les librairies nécessaires pour cette étude:\n", + "\n", + "!pip install nltk\n", + "\n", + "import nltk\n", + "\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "nltk.download('wordnet')" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.12.0)\n", + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping 
tokenizers/punkt.zip.\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/wordnet.zip.\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bY66z-NhpaC4", + "colab_type": "text" + }, + "source": [ + "# **1- Suppression des stopwords de tous les messages**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "z2qdjnkGpaC5", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "\n", + "stop_words = set(stopwords.words('english'))\n", + "# Assign the messages without stopwords to another column\n", + "spam['Message_without_stopwords'] = spam.Message.apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "p1nNJ83BpaC9", + "colab_type": "code", + "colab": {} + }, + "source": [ + "spam['Message_with_stopwords'] = spam.Message" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jcErxwmxpaDz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "spam.drop(columns=['Message'],inplace=True)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YK2CNic4paDD", + "colab_type": "text" + }, + "source": [ + "# **2- Application du stemming sur les messages sans stopwords**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oZIBQesipaDE", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from nltk.stem import SnowballStemmer\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "stemmer = SnowballStemmer('english')\n", + "spam['Message_stemmed_without_StopWords'] 
= spam.Message_without_stopwords.apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "alk_IGb6paDL", + "colab_type": "text" + }, + "source": [ + "# **3- Application de la lemmatisation sur les messages sans stopwords (mais pas après stemming)**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "rP2LlXsvpaDN", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from nltk.stem import WordNetLemmatizer \n", + "\n", + "lemmatizer = WordNetLemmatizer()\n", + "spam['Message_lemmatized_before_stemming'] = spam.Message_without_stopwords.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nYZjzxGjpaDT", + "colab_type": "text" + }, + "source": [ + "# **4- Comparaison des résultats entre 2 et 3 (2-3 phrases suffisent)**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "aRuLdRTzpaDV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# show the totality of message in our dataSet\n", + "pd.set_option('display.max_colwidth', -1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "y4FLI0AgpaDZ", + "colab_type": "code", + "outputId": "2b41d1ee-d5fb-4efc-ee7e-6e79f26d3779", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 511 + } + }, + "source": [ + "spam[['Message_without_stopwords','Message_stemmed_without_StopWords','Message_lemmatized_before_stemming']].head(10)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Message_without_stopwordsMessage_stemmed_without_StopWordsMessage_lemmatized_before_stemming
0Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ...go jurong point , crazy.. avail bugi n great world la e buffet ... cine got amor wat ...Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ...
1Ok lar ... Joking wif u oni ...ok lar ... joke wif u oni ...Ok lar ... Joking wif u oni ...
2Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 'sfree entri 2 wkli comp win fa cup final tkts 21st may 2005 . text fa 87121 receiv entri question ( std txt rate ) t & c 's appli 08452810075over18 'sFree entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's
3U dun say early hor ... U c already say ...u dun say earli hor ... u c alreadi say ...U dun say early hor ... U c already say ...
4Nah I n't think goes usf , lives around thoughnah i n't think goe usf , live around thoughNah I n't think go usf , life around though
5FreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcvfreemsg hey darl 's 3 week 's word back ! i 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcvFreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcv
6Even brother like speak . They treat like aids patent .even brother like speak . they treat like aid patent .Even brother like speak . They treat like aid patent .
7As per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friends Callertuneas per request mell mell ( oru minnaminungint nurungu vettam ) ' set callertun caller . press *9 copi friend callertunAs per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friend Callertune
8WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hours .winner ! ! as valu network custom select receivea £900 prize reward ! to claim call 09061701461 . claim code kl341 . valid 12 hour .WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hour .
9Had mobile 11 months ? U R entitled Update latest colour mobiles camera Free ! Call The Mobile Update Co FREE 08002986030had mobil 11 month ? u r entitl updat latest colour mobil camera free ! call the mobil updat co free 08002986030Had mobile 11 month ? U R entitled Update latest colour mobile camera Free ! Call The Mobile Update Co FREE 08002986030
\n", + "
" + ], + "text/plain": [ + " Message_without_stopwords ... Message_lemmatized_before_stemming\n", + "0 Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ... ... Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ... \n", + "1 Ok lar ... Joking wif u oni ... ... Ok lar ... Joking wif u oni ... \n", + "2 Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's ... Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's\n", + "3 U dun say early hor ... U c already say ... ... U dun say early hor ... U c already say ... \n", + "4 Nah I n't think goes usf , lives around though ... Nah I n't think go usf , life around though \n", + "5 FreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcv ... FreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcv \n", + "6 Even brother like speak . They treat like aids patent . ... Even brother like speak . They treat like aid patent . \n", + "7 As per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friends Callertune ... As per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friend Callertune \n", + "8 WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hours . ... WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hour . \n", + "9 Had mobile 11 months ? U R entitled Update latest colour mobiles camera Free ! Call The Mobile Update Co FREE 08002986030 ... Had mobile 11 month ? 
U R entitled Update latest colour mobile camera Free ! Call The Mobile Update Co FREE 08002986030 \n", + "\n", + "[10 rows x 3 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2ykAiHa_paDf", + "colab_type": "text" + }, + "source": [ + "If we look at the phrase **'Had mobile 11 months ? U R entitled Update latest colour mobiles camera Free ! Call The Mobile Update Co FREE 08002986030'**, and especially at the verb *\"entitled\"*, which is in the past tense, stemming would normally be expected to give ***'entitle'***, but the result is ***'entitl'***, while lemmatization kept the ***\"ed\"*** ending (WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is given). The same thing applies to the message **'WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hours .'** with the verbs \"select\" and \"value\"." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7aMTeMi-paDi", + "colab_type": "text" + }, + "source": [ + "# **5- Exécutez une TF-IDF sur les messages après Stemming ou Lemmatization**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlGe8aRltBE0", + "colab_type": "text" + }, + "source": [ + "\n", + "\n", + "* **sublinear_tf** est mis à True pour utiliser la forme logarithmique de la fréquence.\n", + "* **min_df** est le nombre minimum de documents dans lesquels un mot doit être présent pour être conservé.\n", + "* **norm** est mis à \"l2\", pour garantir que tous nos vecteurs de caractéristiques ont une norme euclidienne de 1.\n", + "* **ngram_range** est réglé sur (1, 2) pour indiquer que nous voulons considérer à la fois les unigrammes (un mot) et les bigrammes (deux mots).\n", + "\n", + "Pour plus de détails, voir: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"ioikwpoYxGCV", + "colab_type": "text" + }, + "source": [ + "## 5.1. Le cas du Lemmatizing" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Nqih3IykpaDj", + "colab_type": "code", + "outputId": "822a67f2-56d2-4a57-cf66-43347d01f360", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 402 + } + }, + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "corpus_lemm = spam.Message_lemmatized_before_stemming\n", + "\n", + "#Initialisation du TfidfVectorizer\n", + "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", + "\n", + "#transformation du message Lemmatizing\n", + "Y = vectorizer.fit_transform(corpus_lemm)\n", + "\n", + "# on crée une dataFrame pour avoir les mots les plus fréquents et son TFIDF (ce fréquence est en pourcentage)\n", + "df2 = pd.DataFrame(np.round(Y[0].T.todense() *100,2), index=vectorizer.get_feature_names(), columns=[\"tfidf % - lemmatize\"])\n", + "df2.sort_values(by=[\"tfidf % - lemmatize\"],ascending=False)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tfidf % - lemmatize
bugis36.84
la36.84
cine36.84
crazy33.77
available32.62
......
friends0.00
friendship0.00
friendship good0.00
frm0.00
zed0.00
\n", + "

2566 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " tfidf % - lemmatize\n", + "bugis 36.84 \n", + "la 36.84 \n", + "cine 36.84 \n", + "crazy 33.77 \n", + "available 32.62 \n", + "... ... \n", + "friends 0.00 \n", + "friendship 0.00 \n", + "friendship good 0.00 \n", + "frm 0.00 \n", + "zed 0.00 \n", + "\n", + "[2566 rows x 1 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 14 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jLAkFcY4xM52", + "colab_type": "text" + }, + "source": [ + "## 5.2. Le cas du Stemming" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "C3xTSLELpaDt", + "colab_type": "code", + "outputId": "53e31058-e233-468b-e580-bd3805c53488", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 402 + } + }, + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "corpus_stem = spam.Message_stemmed_without_StopWords\n", + "\n", + "#Initialisation du TfidfVectorizer\n", + "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", + "\n", + "#transformation du message stemmé\n", + "X = vectorizer.fit_transform(corpus_stem)\n", + "\n", + "# on crée une dataFrame pour avoir les mots les plus fréquents et son TFIDF (ce fréquence est en pourcentage)\n", + "df1 = pd.DataFrame(np.round(X[0].T.todense() *100,2), index=vectorizer.get_feature_names(), columns=[\"tfidf % - stemmed\"])\n", + "df1.sort_values(by=[\"tfidf % - stemmed\"],ascending=False)" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tfidf % - stemmed
la39.31
bugi39.31
cine39.31
avail35.38
point31.77
......
friend callertun0.00
friend repli0.00
friends0.00
friendship0.00
zed0.00
\n", + "

2534 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " tfidf % - stemmed\n", + "la 39.31 \n", + "bugi 39.31 \n", + "cine 39.31 \n", + "avail 35.38 \n", + "point 31.77 \n", + "... ... \n", + "friend callertun 0.00 \n", + "friend repli 0.00 \n", + "friends 0.00 \n", + "friendship 0.00 \n", + "zed 0.00 \n", + "\n", + "[2534 rows x 1 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 15 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_6tF5a_3u_5C", + "colab_type": "text" + }, + "source": [ + "# **6 (Bonus): Lancez une classification pour détecter les *spams***" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LtzJ3lqhpaD-", + "colab_type": "code", + "outputId": "8cd7b995-8324-44fb-b612-ff9b0925ab21", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + } + }, + "source": [ + "spam.Category.value_counts()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "ham 4825\n", + "spam 747 \n", + "Name: Category, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 16 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yDR1J_lDx0sB", + "colab_type": "text" + }, + "source": [ + "On remarque que le nombre de messages par catégorie est déséquilibré : le jeu de données est biaisé en faveur du ham par rapport au spam" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "S2v7WbAJyG0L", + "colab_type": "code", + "outputId": "befe447c-2f37-4edc-aa84-471830a9d314", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 422 + } + }, + "source": [ + "import matplotlib.pyplot as plt\n", + "spam.groupby('Category').Message_with_stopwords.count().plot.bar(ylim=0,figsize=(8,6))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 17 + }, + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAewAAAGECAYAAAD0jRNXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAVRklEQVR4nO3dfZBdd33f8c8XC3BaAjZYOFQyyAnu\ntIYASTTGTNJOgKlsMKnIQKgzKajEU/1R2gmdNsG0MC5PLQzT8JBpSFzsRmYA46ZQK8EEPOYhDcVg\nGYMfeBgUY8ZWASvIOBBqasO3f9yzZGMk78q+2stv9/Wa2dlzfufce393Rtr3nnvPnlvdHQDgR9uD\nFj0BAGBlgg0AAxBsABiAYAPAAAQbAAawadETuC8nnXRSb9u2bdHTAIA1c+211/5Fd2++9/iqgl1V\ntyT5VpLvJbmnu7dX1SOTvCfJtiS3JHlBd99RVZXkLUmeneQ7Sf5Zd396up9dSV4x3e1ru3vPfT3u\ntm3bsm/fvtVMEQDWhar6yuHGj+Yl8ad391O6e/u0fn6Sq7r7tCRXTetJ8qwkp01fu5O8bZrAI5Nc\nkOSpSc5IckFVnXi0TwQANqIH8h72ziRLR8h7kjx32fglPXN1khOq6jFJzkpyZXcf6u47klyZ5OwH\n8PgAsGGsNtid5ENVdW1V7Z7GTu7ur07LX0ty8rS8Jcmty2572zR2pHEAYAWrPensF7r7QFU9OsmV\nVfWF5Ru7u6tqLtc4nX4h2J0kj33sY+dxlwAwvFUdYXf3gen77Unel9l70F+fXurO9P32afcDSU5Z\ndvOt09iRxu/9WBd29/bu3r558w+dJAcAG9KKwa6qv11VP760nGRHkhuT7E2ya9ptV5LLp+W9SV5U\nM2cmuXN66fyDSXZU1YnTyWY7pjEAYAWreUn85CTvm/21VjYleVd3/0lVXZPksqo6L8lXkrxg2v+K\nzP6ka39mf9b14iTp7kNV9Zok10z7vbq7D83tmQDAOlY/yh+vuX379vZ32ABsJFV17bI/of4BlyYF\ngAEINgAMQLABYACCDQADEGwAGIBgA8AAfqQ/D3u923b++xc9BR6AW15/zqKnAGwgjrABYACCDQAD\nEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoAB\nCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AA\nBBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAA\ngg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACrDnZVHVdV11XVH0/rp1bVJ6tqf1W9\np6oeMo0/dFrfP23ftuw+Xj6Nf7Gqzpr3kwGA9epojrB/I8nnl62/IcmbuvvxSe5Ict40fl6SO6bx\nN037papOT3JukickOTvJ71bVcQ9s+gCwMawq2FW1Nck5Sd4+rVeSZyT5w2mXPUmeOy3vnNYzbX/m\ntP/OJJd293e7+8tJ9ic5Yx5PAgDWu9UeYb85yW8l+f60/qgk3+zue6b125JsmZa3JLk1Sabtd077\n/2D8MLf5garaXVX7qmrfwYMHj+KpAMD6tWKwq+o5SW7v7mvXYD7p7gu7e3t3b9+8efNaPCQA/Mjb\ntIp9fj7JP66qZyc5PsnDk7wlyQlVtWk6it6a5MC0/4EkpyS5rao2JXlEkm8sG1+y/DYAwH1Y8Qi7\nu1/e3Vu7e1tmJ419uLt/LclHkjx/2m1Xksun5b3TeqbtH+7unsbPnc4iPzXJaUk+NbdnAgDr2GqO\nsI/kZUkurarXJrkuyUXT+EVJ3lFV+5Mcyizy6e6bquqyJJ9Lc
k+Sl3T39x7A4wPAhnFUwe7ujyb5\n6LR8cw5zlnd335XkV45w+9cled3RThIANjpXOgOAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAG\nINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQAD\nEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoAB\nCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AA\nBBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGsGKwq+r4qvpUVX22qm6qqldN\n46dW1Seran9VvaeqHjKNP3Ra3z9t37bsvl4+jX+xqs46Vk8KANab1RxhfzfJM7r7yUmekuTsqjoz\nyRuSvKm7H5/kjiTnTfufl+SOafxN036pqtOTnJvkCUnOTvK7VXXcPJ8MAKxXKwa7Z749rT54+uok\nz0jyh9P4niTPnZZ3TuuZtj+zqmoav7S7v9vdX06yP8kZc3kWALDOreo97Ko6rqo+k+T2JFcm+fMk\n3+zue6ZdbkuyZVrekuTWJJm235nkUcvHD3Ob5Y+1u6r2VdW+gwcPHv0zAoB1aFXB7u7vdfdTkmzN\n7Kj47x2rCXX3hd29vbu3b968+Vg9DAAM5ajOEu/ubyb5SJKnJTmhqjZNm7YmOTAtH0hySpJM2x+R\n5BvLxw9zGwDgPqzmLPHNVXXCtPxjSf5Rks9nFu7nT7vtSnL5tLx3Ws+0/cPd3dP4udNZ5KcmOS3J\np+b1RABgPdu08i55TJI90xndD0pyWXf/cVV9LsmlVfXaJNcluWja/6Ik76iq/UkOZXZmeLr7pqq6\nLMnnktyT5CXd/b35Ph0AWJ9WDHZ3X5/kZw4zfnMOc5Z3d9+V5FeOcF+vS/K6o58mAGxsrnQGAAMQ\nbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEI\nNgAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAE\nGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACC\nDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADB\nBoABCDYADECwAWAAKwa7qk6pqo9U1eeq6qaq+o1p/JFVdWVVfWn6fuI0XlX11qraX1XXV9XPLruv\nXdP+X6qqXcfuaQHA+rKaI+x7kvyb7j49yZlJXlJVpyc5P8lV3X1akqum9SR5VpLTpq/dSd6WzAKf\n5IIkT01yRpILliIPANy3FYPd3V/t7k9Py99K8vkkW5LsTLJn2m1PkudOyzuTXNIzVyc5oaoek+Ss\nJFd296HuviPJlUnOnuuzAYB16qjew66qbUl+Jsknk5zc3V+dNn0tycnT8pYkty672W3T2JHG7/0Y\nu6tqX1XtO3jw4NFMDwDWrVUHu6oeluR/JHlpd//l8m3d3Ul6HhPq7gu7e3t3b9+8efM87hIAhreq\nYFfVgzOL9Tu7+73T8Nenl7ozfb99Gj+Q5JRlN986jR1pHABYwWrOEq8kFyX5fHf/9rJNe5Msnem9\nK8nly8ZfNJ0tfmaSO6eXzj+YZEdVnTidbLZjGgMAVrBpFfv8fJIXJrmhqj4zjf27JK9PcllVnZfk\nK0leMG27Ismzk+xP8p0kL06S7j5UVa9Jcs2036u7+9BcngUArHMrBru7/yxJHWHzMw+zfyd5yRHu\n6+IkFx/NBAEAVzoDgCEIN
gAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFg\nAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAw\nAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAY\ngGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAM\nQLABYACCDQADEGwAGIBgA8AABBsABrBisKvq4qq6vapuXDb2yKq6sqq+NH0/cRqvqnprVe2vquur\n6meX3WbXtP+XqmrXsXk6ALA+reYI+w+SnH2vsfOTXNXdpyW5alpPkmclOW362p3kbcks8EkuSPLU\nJGckuWAp8gDAylYMdnf/aZJD9xremWTPtLwnyXOXjV/SM1cnOaGqHpPkrCRXdveh7r4jyZX54V8C\nAIAjuL/vYZ/c3V+dlr+W5ORpeUuSW5ftd9s0dqTxH1JVu6tqX1XtO3jw4P2cHgCsLw/4pLPu7iQ9\nh7ks3d+F3b29u7dv3rx5XncLAEO7v8H++vRSd6bvt0/jB5Kcsmy/rdPYkcYBgFW4v8Hem2TpTO9d\nSS5fNv6i6WzxM5PcOb10/sEkO6rqxOlksx3TGACwCptW2qGq3p3kF5OcVFW3ZXa29+uTXFZV5yX5\nSpIXTLtfkeTZSfYn+U6SFydJdx+qqtckuWba79Xdfe8T2QCAI1gx2N39q0fY9MzD7NtJXnKE+7k4\nycVHNTsAIIkrnQHAEAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsABrDipUkB\n1ptt579/0VPgAbjl9ecsegoL4QgbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsA\nBiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0A\nAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaA\nAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADGDNg11VZ1fVF6tqf1Wdv9aP\nDwAjWtNgV9VxSf5LkmclOT3Jr1bV6Ws5BwAY0VofYZ+RZH9339zd/y/JpUl2rvEcAGA4ax3sLUlu\nXbZ+2zQGANyHTYuewL1V1e4ku6fVb1fVFxc5Hx6Qk5L8xaIncazUGxY9Azgi//fG9rjDDa51sA8k\nOWXZ+tZp7Ae6+8IkF67lpDg2qmpfd29f9Dxgo/F/b31a65fEr0lyWlWdWlUPSXJukr1rPAcAGM6a\nHmF39z1V9S+TfDDJcUku7u6b1nIOADCiNX8Pu7uvSHLFWj8uC+GtDVgM//fWoeruRc8BAFiBS5MC\nwAAEGwAGINgAMIAfuQunML6qelKSbVn276u737uwCQGsA4LNXFXVxUmelOSmJN+fhjuJYMMxNH24\n0jn54V+Wf3tRc2K+BJt5O7O7fQIbrL0/SnJXkhvy178ss44INvP2iao6vbs/t+iJwAaztbuftOhJ\ncOwINvN2SWbR/lqS7yapJO0HCRxzH6iqHd39oUVPhGNDsJm3i5K8MF6Wg7V2dZL3VdWDktydv/5l\n+eGLnRbz4kpnzFVVfaK7n7boecBGU1VfTrIzyQ3tB/u65Aibebuuqt6V2Qkw310a9GddcMzdmuRG\nsV6/BJt5+7HMQr1j2Zg/64Jj7+YkH62qD+Rv/rLsz7rWCcFmrrr7xYueA2xQX56+HjJ9sc54D5u5\nqqrjk5yX5AlJjl8a7+5fX9ikANYB1xJn3t6R5CeSnJXkY0m2JvnWQmcEG0BVba6qN1bVFVX
14aWv\nRc+L+RFs5u3x3f3KJH/V3Xsyu1TiUxc8J9gI3pnkC0lOTfKqJLckuWaRE2K+BJt5u3v6/s2qemKS\nRyR59ALnAxvFo7r7oiR3d/fHprehnrHoSTE/Tjpj3i6sqhOTvCLJ3iQPS/LKxU4JNoSlX5a/WlXn\nJPk/SR65wPkwZ046Y66q6qFJnpfZJwY9eBru7n71wiYFG0BVPSfJ/0pySpLfSfLwJK/q7r0LnRhz\nI9jMVVX9SZI7k1yb5HtL4939nxc2KYB1QLCZq6q6sbufuOh5wEZTVT+Z5C1JnpbZdfw/keRfd/fN\nC50Yc+OkM+btf1fVTy96ErABvSvJZZn9WeXfSfLfk7x7oTNirhxhMxdVdUNmlyDdlOS0zC6T6OM1\nYY1U1fX3/n9WVZ/t7icvak7Ml2AzF1X1uPva3t1fWau5wEZUVW9IckeSSzP75fmfJDkxyRuTpLsP\nLW52zINgA6wD08drLln6wV5L6939k2s8JebMe9gA68PLkjy5u09N8t+SfDbJ87r7VLFeHwQbYH14\nRXf/ZVX9QmZXOHt7krcteE7MkWADrA9L1z04J8l/7e73x8dsriuCDbA+HKiq38/sZLMrpqsO+hm/\njjjpDGAdqKq/leTsJDd095eq6jFJfrq7P7TgqTEngg0AA/ByCQAMQLABYACCDYOrqp+oqkur6s+r\n6tqquqKq/u4R9j2hqv7FWs8ReOAEGwZWVZXkfUk+2t0/1d0/l+TlSU4+wk1OSHLMg11Vm471Y8BG\nI9gwtqcnubu7f29poLs/m+S6qrqqqj5dVTdU1c5p8+uT/FRVfaaq3pgkVfWbVXVNVV1fVa9aup+q\nemVVfbGq/qyq3l1V/3Yaf0pVXT3t/76qOnEa/2hVvbmq9iX591X15ap68LTt4cvXgaPnt2AY2xOT\nXHuY8buS/PJ05auTklxdVXuTnJ/kid39lCSpqh2ZfbraGZldd3pvVf3DJP83yfOSPDnJg5N8etnj\nXJLkX3X3x6rq1UkuSPLSadtDunv7dN/bMruIx/9Mcm6S93b33XN87rChCDasT5XkP07x/X6SLTn8\ny+Q7pq/rpvWHZRbwH09yeXffleSuqvqjJKmqRyQ5obs/Nu2/J7PPXV7ynmXLb0/yW5kF+8VJ/vkc\nnhdsWIINY7spyfMPM/5rSTYn+bnuvruqbkly/GH2qyT/qbt//28MVr30MPuuxl8tLXT3x6tqW1X9\nYpLjuvvG+3mfQLyHDaP7cJKHVtXupYGqelKSxyW5fYr106f1JPlWZkfPSz6Y5Ner6mHTbbdU1aOT\nfDzJL1XV8dO25yRJd9+Z5I6q+gfT7V+Y5GM5skuSvCuzT48CHgBH2DCw7u6q+uUkb66ql2X23vUt\nSf5DkrdW1Q1J9iX5wrT/N6rq41V1Y5IPdPdvVtXfT/KJ2Qnn+XaSf9rd10zveV+f5OtJbkhy5/Sw\nu5L83nQpzJsze7n7SN6Z5LVJ3j3Hpw0bkkuTAodVVQ/r7m9PYf7TJLu7+9NHeR/PT7Kzu194TCYJ\nG4gjbOBILqyq0zN773vP/Yj17yR5VpJnH4vJwUbjCBsABuCkMwAYgGADwAAEGwAGINgAMADBBoAB\n/H/++ODwA5LRyQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmxodxIbz-iq", + "colab_type": "text" + }, + "source": [ + "## 6.1. Le cas du stemming" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ywiY6StK0QIL", + "colab_type": "text" + }, + "source": [ + "On crée la colonne category_id pour assigner un ID à chaque catégorie, puis on supprime les doublons afin de construire la correspondance catégorie/ID" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nro-kS8gpaER", + "colab_type": "code", + "colab": {} + }, + "source": [ + "spam['category_id'] = spam.Category.factorize()[0]\n", + "category_id_df = spam[['Category', 'category_id']].drop_duplicates().sort_values('category_id')\n", + "category_to_id = dict(category_id_df.values)\n", + "id_to_category = dict(category_id_df[['category_id', 'Category']].values)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GtFcat500quA", + "colab_type": "text" + }, + "source": [ + "Création d'un DataFrame spécifique aux messages stemmés" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "wXK2rIx_paEn", + "colab_type": "code", + "colab": {} + }, + "source": [ + "col = ['Category', 'Message_stemmed_without_StopWords','category_id']\n", + "df_stem = spam[col]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jgBG8oKKpaEs", + "colab_type": "code", + "outputId": "be9b9fbe-ce97-4b42-fced-ff5e29a73029", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 212 + } + }, + "source": [ + "df_stem.head()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMessage_stemmed_without_StopWordscategory_id
0hamgo jurong point , crazy.. avail bugi n great world la e buffet ... cine got amor wat ...0
1hamok lar ... joke wif u oni ...0
2spamfree entri 2 wkli comp win fa cup final tkts 21st may 2005 . text fa 87121 receiv entri question ( std txt rate ) t & c 's appli 08452810075over18 's1
3hamu dun say earli hor ... u c alreadi say ...0
4hamnah i n't think goe usf , live around though0
\n", + "
" + ], + "text/plain": [ + " Category ... category_id\n", + "0 ham ... 0 \n", + "1 ham ... 0 \n", + "2 spam ... 1 \n", + "3 ham ... 0 \n", + "4 ham ... 0 \n", + "\n", + "[5 rows x 3 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 20 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZDmA1G6Wy7So", + "colab_type": "text" + }, + "source": [ + "Maintenant, chacun des 5572 messages stemmés est représenté par 2534 caractéristiques (features), correspondant aux scores tf-idf de différents unigrammes et bigrammes" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "aOs3R6oCpaEM", + "colab_type": "code", + "outputId": "518bce1e-6708-41b2-e3bb-b5b5d73d7ff5", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "corpus_stem = spam.Message_stemmed_without_StopWords\n", + "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", + "featuresStem = vectorizer.fit_transform(corpus_stem).toarray()\n", + "labels = df_stem.category_id\n", + "featuresStem.shape" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(5572, 2534)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 21 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BCDjJum102Di", + "colab_type": "text" + }, + "source": [ + "### 6.1.1. 
Classification à l'aide du Naive bayes" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9UmH7Jb-paE2", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(df_stem['Message_stemmed_without_StopWords'], df_stem['Category'], random_state = 0)\n", + "count_vect = CountVectorizer()\n", + "X_train_counts = count_vect.fit_transform(X_train)\n", + "tfidf_transformer = TfidfTransformer()\n", + "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)\n", + "\n", + "clf = MultinomialNB().fit(X_train_tfidf, y_train)\n", + "y_pred = clf.predict(count_vect.transform(X_test))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1izmZb2iw3Ov", + "colab_type": "code", + "outputId": "94c71c1d-2a39-4105-ebf6-f6a3cfbdc9ac", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "source": [ + "from sklearn import metrics\n", + "print(metrics.classification_report(y_test, y_pred, target_names=df_stem['Category'].unique()))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " ham 0.98 1.00 0.99 1208\n", + " spam 0.99 0.89 0.94 185\n", + "\n", + " accuracy 0.98 1393\n", + " macro avg 0.99 0.94 0.96 1393\n", + "weighted avg 0.98 0.98 0.98 1393\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jgFWSxQ7paE6", + "colab_type": "code", + "outputId": "850cfa70-51b7-496f-8dd4-a930f3ff17ea", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "print(clf.predict(count_vect.transform([\"Free entry 2 wkly comp win FA 
Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's\"])))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['spam']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5myKRjX5paE_", + "colab_type": "code", + "outputId": "2e19fb36-0ea4-44d4-fd67-025bd16eb343", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "print(clf.predict(count_vect.transform([\"Nah I n't think go usf , life around though\"])))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['ham']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tZxOPEOP34LN", + "colab_type": "text" + }, + "source": [ + "On remarque que 22 messages ont été mal classés (21 spam, 1 ham)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5_9AetF1paFG", + "colab_type": "code", + "outputId": "65985204-3f4d-47ff-9361-cd1201d9e956", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 388 + } + }, + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "conf_mat = confusion_matrix(y_test, y_pred)\n", + "fig, ax = plt.subplots(figsize=(8,6))\n", + "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", + "plt.ylabel('Actual')\n", + "plt.xlabel('Predicted')\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAc80lEQVR4nO3deZxcVbXo8d8iERIZA8iUIJNwkUHC\noCLqZVLmSXhX4CEi8gxX4TogKioyePHigPg+CA8JCASQSSESBASMiEEJY4CAoOYyXImBgAoqIAR6\nvT/qJFRihu6u7D6dU78vn/Op6l2nau/i07B67b3OPpGZSJKkxW+pugcgSVJTGWQlSSrEICtJUiEG\nWUmSCjHISpJUiEFWkqRChtY9gAWZ9eyjXlukJd7wtd5b9xCkxeLVV6ZHqc/u9P/3b1h1/WJj69Sg\nDbKSpC7R81rdIyjG6WJJkgoxk5Uk1St76h5BMQZZSVK9egyykiQVkQ3OZF2TlSSpEDNZSVK9nC6W\nJKmQBk8XG2QlSfVq8HWyBllJUr0anMla+CRJUiFmspKkeln4JElSGU2+TtYgK0mql5msJEmFNDiT\ntfBJkqRCzGQlSfXyOllJkgpp8HSxQVaSVK8GFz65JitJUiFmspKkejldLElSIQ2eLjbISpJqlWl1\nsSRJZTR4utjCJ0mSCjGTlSTVyzVZSZIKafB0sUFWklQvt1WUJKmQBmeyFj5JklSImawkqV4WPkmS\nVEiDp4sNspKkejU4k3VNVpKkQsxkJUn1anAma5CVJNXKGwRIklSKmawkSYU0uLrYwidJkgoxk5Uk\n1cvpYkmSCmnwdLFBVpJULzNZSZIKaXAma+GTJEmFmMlKkurldLEkSYUYZCVJKsQ1WUmS1FdmspKk\nejldLElSIQ2eLjbISpLq1eBM1jVZSVK9sqezYxEi4vyImBkRD7a1rRwRN0fE76vHEVV7RMQZETEt\nIh6IiK3a3nNYdf7vI+Kw3nw1g6wkqekuBHabp+04YGJmbghMrH4G2B3YsDrGAGdDKygDJwLvBN4B\nnDg7MC+MQVaSVK+ens6ORcjMXwJ/nqd5X2Bc9XwcsF9b+0XZMhlYKSLWBHYFbs7MP2fmX4Cb+efA\n/U9ck5Uk1aueNdnVM3NG9fwpYPXq+UjgD23nPVm1Lah9ocxkJUn1yuzoiIgxEXF32zGmb91nAlni\nq5nJSpLq1WEmm5ljgbF9fNvTEbFmZs6opoNnVu3TgbXbzhtVtU0Hdpin/ReL6sRMVpLUjSYAsyuE\nDwOuaWv/cFVlvC3wfDWtfCOwS0SMqAqedqnaFspMVpJUr8JrshFxGa0sdNWIeJJWlfDXgSsj4gjg\nCeCD1enXA3sA04AXgcMBMvPPEfGfwF3VeV/NzHmLqf6JQVaSVK/COz5l5sELeGnn+ZybwFEL+Jzz\ngfP70rdBVpJUL3d8kiRJfWUmK0mqVxa5emZQMMhKkurV4Olig6wkqV4GWUmSCmnw/WQtfJIkqRAz\nWUlSrbLHwidJkspwTVaSpEIavCZrkJUk1avB08UWPkmSVIiZrCSpXq7JSpJUiEFWkqRCGrx3sWuy\nkiQVYibbMMf/1+n88ld3svKIlfjxJd8D4LQzz+PWX93B0DcMZe2Ra3LKl45hheWXA+Dci67g6p/c\nyJClluKLn/k4737n1jz2xJMce8Kpcz7zyT/O4Oj/cyiHHviBWr6TtCDnjv02e+7xPmY+8yyjt/yn\n+29rSdHg6WIz2YbZb4/3873TT5mr7V1v35LxF3+P8Redzbprj+S8i68A4L8fe4IbJt7KNZd8j++d\nfgr/edqZvPbaa6y3ziiuGncWV407iyvPP4Nhw4ax8/bb1fF1pIW66KIr2XOvQ+oehjrVk50dg5hB\ntmG2Gb05K66w/Fxt737n1gwdOgSAt226MU/PfBaAn0+azO47b
8/SSy/NqLXW4M2j1mLqw7+b672T\n776PtUeuyVprrD4wX0Dqg0m33cGf//Jc3cNQp7Kns2MQKz5dHBFvA9Zt7yszry7dr+Zv/HU3sdvO\n2wMw85k/8bbNNp7z2uqrrcrMZ56d6/wbJt7KHu/bfkDHKKnLDPJstBNFM9mIOB84HzgA2Ls69lrI\n+WMi4u6IuPu8iy4rObSudM64yxgyZAh77bJjr86fNWsWv7jtDnbZ6b2FRyZJzVQ6k902Mzfp7cmZ\nORYYCzDr2Ueb+6dNDX583c388ld3ct4ZpxIRAKz2plV46uln5pzz9MxnWe1Nq875edLku3nrRhuw\n6sojBny8krpHWvjUb7dHRK+DrMq4bfLdnH/pD/nuN05k+LBhc9p3fM+23DDxVl555RWe/ONT/M+T\nf2Tzt2405/Xrb/4Fe7x/hxpGLKmrNLjwqXQmexGtQPsU8DIQQGbm2wr327U+d+LXuWvKAzz33F/Z\neb8P8YkjDuW8i6/glVmz+Ninvwy0ip9O/Px/8Jb112HXnd7LPoccydAhQ/jyMZ9gyJBWgdSLL/2D\n2++awomf/2SdX0daqEsuPovt//VdrLrqyjz+6N2c/NXTuODCy+selvpqkBcvdSKy4E4bETENOAaY\nCsz5t5iZTyzqvU4XqwmGr+V6tprh1VemR6nPfuGUD3X0//tlj7+k2Ng6VTqTfSYzJxTuQ5K0JBvk\nU76dKB1kp0TEpcC1tKaLAS/hkSS1aXDhU+kgO5xWcN2lrS0Bg6wkqcVMtn8y8/CSny9JaoAGFz4V\nDbIRMQw4AtgUmHPtSGZ+tGS/kiQNBqWvk70YWAPYFbgVGAX8rXCfkqQlidfJ9ttbMvPfImLfzBxX\nFUFNKtynJGkJ0uQdn0oH2VnV43MRsRnwFLBa4T4lSUuSQZ6NdqJ0kB0bESOA44EJwHLAVwr3KUla\nkhhk++1iWnfgWRcYV7V5Y1JJUlcoHWSvAZ4H7qFtMwpJkubwEp5+G5WZuxXuQ5K0JHO6uN9+HRGb\nZ+bUwv1IkpZQaZDtm4iYSmv7xKHA4RHxKN7qTpLUZUplsnsV+lxJUtOYyfZNb+4XK0kS4F14JEkq\nxkxWkqRCGhxkS98gQJKkrmUmK0mqVWZzM1mDrCSpXg2eLjbISpLqZZCVJKmMJu/4ZOGTJEmFmMlK\nkurV4EzWICtJqldzN3wyyEqS6uWarCRJ6jMzWUlSvRqcyRpkJUn1ck1WkqQymrwma5CVJNWrwZms\nhU+SJBViJitJqlWTp4vNZCVJ9erp8FiEiPiXiLiv7fhrRHw6Ik6KiOlt7Xu0veeLETEtIn4bEbv2\n96uZyUqSapWF12Qz87fAaICIGAJMB8YDhwPfyczT2s+PiE2Ag4BNgbWAn0XERpn5Wl/7NpOVJNWr\ncCY7j52B/87MJxZyzr7A5Zn5cmY+BkwD3tHnnjDISpKWcBExJiLubjvGLOT0g4DL2n4+OiIeiIjz\nI2JE1TYS+EPbOU9WbX1mkJUk1Sp7Ojwyx2bmNm3H2Pn1ExFLA/sAP6yazgY2oDWVPAP49uL+bq7J\nSpLqNXDXye4O3JuZTwPMfgSIiHOBn1Q/TgfWbnvfqKqtz8xkJUm16jST7YODaZsqjog12177APBg\n9XwCcFBELBMR6wEbAnf257uZyUqSGi8ilgXeDxzZ1vzNiBgNJPD47Ncy86GIuBL4DfAqcFR/KovB\nICtJqlnpS3gAMvMFYJV52g5dyPlfA77Wab8GWUlSrQYiyNbFICtJqldG3SMoxiArSapVkzNZq4sl\nSSrETFaSVKvscbpYkqQimjxdbJCVJNUqLXySJKmMJmeyFj5JklSImawkqVYWPkmSVEhm3SMoxyAr\nSapVkzNZ12QlSSrETFaSVKsmZ7IGWUlSrVyTlSSpEDNZSZIKafKOTxY+SZJUiJmsJKlWTd5W0SAr\nSapVT4Oniw2ykqRaNXlNd
oFBNiKuBRZYWJ2Z+xQZkSSpq3RrdfFpAzYKSZIaaIFBNjNvHciBSJK6\nU1dvRhERGwKnApsAw2a3Z+b6BcclSeoS3TpdPNsFwInAd4AdgcPx+lpJ0mLS5Ori3gTL4Zk5EYjM\nfCIzTwL2LDssSZKWfL3JZF+OiKWA30fE0cB0YLmyw5IkdYsmX8LTm0z2U8AbgU8CWwOHAoeVHJQk\nqXtkdnYMZovMZDPzrurp32mtx0qStNg0eU22N9XFtzCfTSkyc6ciI5IkdZUmTxf3Zk322Lbnw4AD\ngFfLDEeSpObozXTxPfM0/Soi7iw0HklSlxns66qd6M108cptPy5Fq/hpxWIjqiw/aofSXUjFjV7F\nPVukRenqNVngHlprskFrmvgx4IiSg5IkdY9uX5N9a2b+o70hIpYpNB5JUpdpcibbm+tkfz2fttsX\n90AkSWqahd1Pdg1gJDA8IrakNV0MsAKtzSkkSepYg+ueFjpdvCvwEWAU8G1eD7J/Bb5UdliSpG7R\n5Onihd1PdhwwLiIOyMyrBnBMkqQu0uTCp96syW4dESvN/iEiRkTEKQXHJElSI/QmyO6emc/N/iEz\n/wLsUW5IkqRu0tPhMZj15hKeIRGxTGa+DBARwwEv4ZEkLRZJc6eLexNkfwBMjIgLaBU/fQQYV3JQ\nkqTu0dPg8uLe7F38jYi4H3gfrUrrG4F1Sg9MktQdehqcyfZmTRbgaVoB9t+AnYCHi41IkqSGWNhm\nFBsBB1fHs8AVQGTmjgM0NklSF+jWNdlHgEnAXpk5DSAiPjMgo5IkdY3BXiHciYVNF+8PzABuiYhz\nI2JnaPCfG5KkWiTR0TGYLTDIZuaPM/MgYGPgFuDTwGoRcXZE7DJQA5QkaUm1yMKnzHwhMy/NzL1p\n7WM8BfhC8ZFJkrpCt29GMUe129PY6pAkqWODPVB2ok9BVpKkxW2wr6t2wiArSapVT3NjbK83o5Ak\nSX1kJitJqlWTt1U0yEqSatXg+wMYZCVJ9WpydbFrspKkWvVEdHT0RkQ8HhFTI+K+iLi7als5Im6O\niN9XjyOq9oiIMyJiWkQ8EBFb9fe7GWQlSd1ix8wcnZnbVD8fB0zMzA2BidXPALsDG1bHGODs/nZo\nkJUk1So7PDqwLzCuej4O2K+t/aJsmQysFBFr9qcDg6wkqVYDtK1iAjdFxD0RMaZqWz0zZ1TPnwJW\nr56PBP7Q9t4nq7Y+s/BJklSrTjejqILmmLamsZk57/a/78nM6RGxGnBzRDzS/mJmZkQs9kJng6wk\naYlWBdSF7qmfmdOrx5kRMR54B/B0RKyZmTOq6eCZ1enTgbXb3j6qauszp4slSbXqITo6FiUilo2I\n5Wc/B3YBHgQmAIdVpx0GXFM9nwB8uKoy3hZ4vm1auU/MZCVJtRqAzShWB8ZH63KfocClmfnTiLgL\nuDIijgCeAD5YnX89sAcwDXgROLy/HRtkJUm1Kn2DgMx8FNhiPu1/AnaeT3sCRy2Ovg2ykqRaueOT\nJEnqMzNZSVKtvEGAJEmFNPmm7QZZSVKtmrwma5CVJNWqyUHWwidJkgoxk5Uk1Spdk5UkqYwmTxcb\nZCVJtWpykHVNVpKkQsxkJUm1cjMKSZIKcTMKSZIKafKarEFWklSrJgdZC58kSSrETFaSVCsLnyRJ\nKsTCJ0mSCmnymqxBVpJUqyZPF1v4JElSIWaykqRa9TQ4lzXISpJq5ZqsJEmFNDePdU1WkqRizGQl\nSbVyuliSpELcjEKSpEKsLpYkqZDmhlgLnyRJKsZMVpJUKwufJEkqxDVZSZIKaW6INchKkmrW5Oli\nC58kSSrETFaSVCvXZCVJKqS5IdYgK0mqmWuykiSpz8xkJUm1ygZPGBtkJUm1avJ0sUFWklQrq4sl\nSSqkuSHWwidJkooxyHaJUaPW5MYbL2fKlInce+/POOqojwKw//57cu+9P+PFFx9nq63eVvM
opfk7\n4fTjuGnqBK64Zdxc7Qd+9AB+NOkSrvjFRXzy+I/P9drqI1fjl9Nu5EP/ftBADlX90EN2dAxmThd3\niVdffY0vfOEU7rvvQZZbblluv/06Jk6cxEMP/ZYDDxzDWWedWvcQpQW69sobuOKCq/nqGV+e07b1\ndlvyr7u+h4N3PpxZr8xixCorzfWeY076D3798zsGeqjqBwuftMR76qmZPPXUTAD+/vcXeOSRaYwc\nuQYTJ06qeWTSok2ZfD9rjlpjrrb/ddh+jDvzEma9MguAv/zpuTmvbb/be5n+PzP4x4svDeg41T9N\nvoSn6HRxRAyJiH0i4pMRcczso2SfWrR11hnF6NGbcuedU+oeitRvb15/bUa/cwsuvO4czrn6u2yy\nxcYADH/jcA476n9z7rcvqHmE6q2eDo/BrHQmey3wD2Aqvfh3ERFjgDEAQ4eOYMiQ5cqOrgstu+wb\nueyyczj22JP529/+XvdwpH4bOnQIK660Ah/Z80g2Hf1WTh17Mvu+80DGHHs4l469kpfMYjUIlA6y\nozKz19U0mTkWGAswbNibmzt/UJOhQ4dy+eXncPnl47nmmp/WPRypI0/PeIafX38rAA/d9zDZk6y0\nykpsttUm7LzXDnzyKx9n+RWWo6cneeXlV7jygqtrHrEWpMnTxaWD7A0RsUtm3lS4H/XCOed8i0ce\nmcYZZ5xX91Ckjt3600ls8+6tuOfXU3jz+msz9A1Dee5Pz/Gx/Y6ec86Yzx7Oiy+8ZIAd5Ab7lG8n\nSgfZycD4iFgKmAUEkJm5QuF+NY/ttns7hxxyAFOnPswdd9wAwAknfJNlllma00//Km9608qMH38B\nDzzwG/be+9CaRyvN7Wv/70S23m5LVlp5Ra675yrGnnY+11x2HSd854tcccs4Zs16lZM+9V91D1P9\n1JPNzWQjC365iHgM2BeYmn3syOliNcFmI9apewjSYnH3jElR6rMPXWf/jv5/f/ETVxcbW6dKZ7J/\nAB7sa4CVJHWPJgeI0kH2UeAXEXED8PLsxsw8vXC/kqQlxGDftakTpYPsY9WxdHVIkjQXq4v7KTNP\nLvn5kqQln9XF/RQRbwI+D2wKDJvdnpk7lexXkqTBoPRdeH4APAKsB5wMPA7cVbhPSdISpMl34Skd\nZFfJzO8DszLz1sz8KGAWK0maIzv8Z1EiYu2IuCUifhMRD0XEp6r2kyJiekTcVx17tL3nixExLSJ+\nGxG79ve7lS58mlU9zoiIPYE/AisX7lOStAQZgDXZV4HPZua9EbE8cE9E3Fy99p3MPK395IjYBDiI\n1lLnWsDPImKjzHytrx2XDrKnRMSKwGeB7wIrAJ8p3KckaQlSeiuFzJwBzKie/y0iHgZGLuQt+wKX\nZ+bLwGMRMQ14B3B7X/suOl2cmT/JzOcz88HM3DEzt87MCSX7lCR1l4gYExF3tx1jFnLuusCWwB1V\n09ER8UBEnB8RI6q2kbQ2U5rtSRYelBeo9P1k14+IayPi2YiYGRHXRMT6JfuUJC1ZOi18ysyxmblN\n2zF2fv1ExHLAVcCnM/OvwNnABsBoWpnutxf3dytd+HQpcCWwBq157R8ClxXuU5K0BBmIm7ZHxBto\nBdgfZObVAJn5dGa+lpk9wLm0poQBpgNrt719VNXWZ6WD7Bsz8+LMfLU6LqHtellJkgagujiA7wMP\nt2/rGxFrtp32AeDB6vkE4KCIWCYi1gM2BO7sz3cbiPvJHgdcTmsP6AOB6yNiZYDM/HPh/iVJejdw\nKDA1Iu6r2r4EHBwRo2nFp8eBIwEy86GIuBL4Da3K5KP6U1kM5YPsB6vHI3n9RgtBqzQ6AddnJanL\nld5QIjNvoxV75nX9Qt7zNeBrnfZderr4C8AWmbkecAFwP3BAZq6XmQZYSRKZ2dExmJUOssdn5l8j\n4j20dno6j1Y1lyRJwMAUPtWldJCdPYe9J3BuZl6Ht7y
TJLUpXfhUp9JBdnpEnMPrBU/LDECfkiQN\nCqUD3geBG4FdM/M5WvsWf65wn5KkJUiT78JT+qbtLwJXt/08Z/9ISZKg/N7FdSp9CY8kSQs12LPR\nTrg+KklSIWaykqRaDfYK4U4YZCVJtepxTVaSpDKaG2INspKkmln4JEmS+sxMVpJUqyZnsgZZSVKt\n3IxCkqRCzGQlSSqkydfJWvgkSVIhZrKSpFq5JitJUiGuyUqSVEiTM1nXZCVJKsRMVpJUK6eLJUkq\npMmX8BhkJUm18lZ3kiQV0uRM1sInSZIKMZOVJNXK6WJJkgpp8nSxQVaSVCszWUmSCmlyJmvhkyRJ\nhZjJSpJq5XSxJEmFNHm62CArSapVZk/dQyjGNVlJkgoxk5Uk1cq78EiSVEiTb9pukJUk1cpMVpKk\nQpqcyVr4JElSIWaykqRauRmFJEmFuBmFJEmFNHlN1iArSapVk6uLLXySJKkQM1lJUq2cLpYkqRCr\niyVJKqTJmaxrspIkFWImK0mqVZOriw2ykqRaNXm62CArSaqVhU+SJBXS5G0VLXySJKkQM1lJUq2c\nLpYkqRALnyRJKqTJa7IGWUlSrZqcyVr4JElSIWaykqRaNTmTNchKkmrV3BAL0eS/ILRwETEmM8fW\nPQ6pU/4ua7ByTba7jal7ANJi4u+yBiWDrCRJhRhkJUkqxCDb3VzDUlP4u6xBycInSZIKMZOVJKkQ\ng2wDRcS6EfFg3eOQpG5nkJUkqRCDbHMNiYhzI+KhiLgpIoZHxMci4q6IuD8iroqINwJExIURcXZE\nTI6IRyNih4g4PyIejogLa/4e6jIRsWxEXFf9nj4YEQdGxOMR8c2ImBoRd0bEW6pz946IOyJiSkT8\nLCJWr9pPiohxETEpIp6IiP3b3v/TiHhDvd9S3cIg21wbAmdl5qbAc8ABwNWZ+fbM3AJ4GDii7fwR\nwLuAzwATgO8AmwKbR8ToAR25ut1uwB8zc4vM3Az4adX+fGZuDpwJ/N+q7TZg28zcErgc+Hzb52wA\n7ATsA1wC3FK9/yVgz/JfQzLINtljmXlf9fweYF1gs+ov+6nAIbSC6GzXZqvUfCrwdGZOzcwe4KHq\nvdJAmQq8PyK+ERHvzcznq/bL2h7fVT0fBdxY/U5/jrl/p2/IzFnV5w3h9WA9FX+nNUAMss31ctvz\n12jdDOJC4Ojqr/mTgWHzOb9nnvf24I0kNIAy83fAVrSC4SkRccLsl9pPqx6/C5xZ/U4fyXx+p6s/\nFmfl69cr+jutAWOQ7S7LAzOq9ahD6h6MND8RsRbwYmZeAnyLVsAFOLDt8fbq+YrA9Or5YQM2SKmX\n/Guuu3wFuAN4pnpcvt7hSPO1OfCtiOgBZgEfB34EjIiIB2hlqAdX554E/DAi/gL8HFhv4IcrLZg7\nPkka9CLicWCbzHy27rFIfeF0sSRJhZjJSpJUiJmsJEmFGGQlSSrEICtJUiEGWQmIiNci4r5qr9wf\nzt7XuZ+ftUNE/KR6vk9EHLeQc1eKiE/0o4+TIuLY/o5R0sAwyEotL2Xm6Gqv3FeAf29/MVr6/N9L\nZk7IzK8v5JSVgD4HWUlLBoOs9M8mAW+p7sv724i4CHgQWDsidomI2yPi3irjXQ4gInaLiEci4l5g\n/9kfFBEfiYgzq+erR8T46u4y90fEdsDXgQ2qLPpb1Xmfq+6W9EBEnNz2WV+OiN9FxG3AvwzYvw1J\n/eaOT1KbiBgK7M7rm8lvCByWmZMjYlXgeOB9mflCRHwBOCYivgmcS+uOL9OAKxbw8WcAt2bmByJi\nCLAccBywWWaOrvrfperzHUAAEyLiX4EXgIOA0bT+u72X1o0fJA1iBlmpZXhEzL5r0STg+8BawBOZ\nOblq3xbYBPhVRAAsTWsP3Y1p3fXo9wARcQkwZj597AR8GCAzXwOej4gR85yzS3VMqX5ejlbQXR4Y\nn5kvVn1M6OjbSho
QBlmp5aXZ2eRsVSB9ob0JuDkzD57nvMV5v90ATs3Mc+bp49OLsQ9JA8Q1Wan3\nJgPvjoi3AETEshGxEfAIsG5EbFCdd/AC3j+R1mb3RMSQiFgR+Btz36jhRuCjbWu9IyNiNeCXwH4R\nMTwilgf2XszfTVIBBlmplzLzGeAjwGXV3WBuBzbOzH/Qmh6+rip8mrmAj/gUsGN1g/F7gE0y80+0\npp8fjIhvZeZNwKXA7dV5PwKWz8x7aa313g/cANxV7ItKWmzcu1iSpELMZCVJKsQgK0lSIQZZSZIK\nMchKklSIQVaSpEIMspIkFWKQlSSpEIOsJEmF/H/bcy7eOnS8CQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YnZZwbEf4IIj", + "colab_type": "text" + }, + "source": [ + "### 6.1.2. Classification par Linear SVC (svm)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "gJs3Jk27paFQ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.svm import LinearSVC\n", + "\n", + "model = LinearSVC()\n", + "\n", + "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(featuresStem, labels, df_stem.index, test_size=0.33, random_state=0)\n", + "model.fit(X_train, y_train)\n", + "y_pred = model.predict(X_test)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BZRcNRos4g4u", + "colab_type": "text" + }, + "source": [ + "On remarque que le f1-score du LinearSVC est amélioré par rapport au Naive Bayes" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "UkErQi5lpaFX", + "colab_type": "code", + "outputId": "83f017c9-fe35-4ec1-ddd5-dfb57307507c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "source": [ + "from sklearn import metrics\n", + "print(metrics.classification_report(y_test, y_pred, target_names=df_stem['Category'].unique()))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " ham 0.99 1.00 0.99 1597\n", + " spam 0.99 0.93 0.96 242\n", + "\n", + " accuracy 0.99 1839\n", + " macro avg 0.99 0.96 0.98 1839\n", + "weighted avg 0.99 0.99 0.99 1839\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "DOVdvCHYpaFd", + "colab_type": "code", + "outputId": "fe3710f3-98f7-4fb5-88fc-088db9cc4b88", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 388 + } + }, + "source": [ + "from sklearn.metrics import confusion_matrix\n", + 
"import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "conf_mat = confusion_matrix(y_test, y_pred)\n", + "fig, ax = plt.subplots(figsize=(8,6))\n", + "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", + "plt.ylabel('Actuel')\n", + "plt.xlabel('Prédit')\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAee0lEQVR4nO3debhd093A8e8vSUXMY2OIlxSdaNHB\nQ1tqqNY8t1Qp6mlQQ1s19UXRepWi6i3Vxhjz3AotqqpoEaFUxFARVDSoVmiNubm/94+zE5c3wx2y\n7r7Z9/vx7Ofss84+Z6/jOU9+97fWb68dmYkkSZr7BtTdAUmSmsogK0lSIQZZSZIKMchKklSIQVaS\npEIMspIkFTKo7g7MytQXJ3ptkeZ5Q5Zbr+4uSHNF21vPRqnP7um/9+9Z6n3F+tZTfTbISpL6ifZp\ndfegGIeLJUkqxExWklSvbK+7B8UYZCVJ9Wo3yEqSVEQ2OJN1TlaSpELMZCVJ9XK4WJKkQho8XGyQ\nlSTVq8HXyRpkJUn1anAma+GTJEmFmMlKkupl4ZMkSWU0+TpZg6wkqV5mspIkFdLgTNbCJ0lSo0XE\nuRHxQkQ81KHtmIh4NiIeqLbNO7z23YiYEBGPRcQXOrRvWrVNiIjDO3NuM1lJUr3KXyd7PnA6cMG7\n2k/NzJM7NkTEh4GdgdWA5YDfRcT7q5fPADYBJgFjI2J0Zj48uxMbZCVJ9So8XJyZt0fESp08fBvg\nssx8E3gyIiYAa1evTcjMiQARcVl17GyDrMPFkqR6tbf3aIuIERFxb4dtRCfPvH9EPFgNJy9etS0P\nPNPhmElV26zaZ8sgK0map2XmyMz8RIdtZCfediawMrAmMBk4pUTfHC6WJNWrhurizHx++n5EnAVc\nXz19Flihw6HDqjZm0z5LZrKSpHr1cLi4OyJi2Q5PtwOmVx6PBnaOiMERMRxYFbgHGAusGhHDI2I+\nWsVRo+d0HjNZSVKtMstWF0fEpcAGwFIRMQk4GtggItYEEngK2LvVlxwfEVfQKmhqA/bLqoMRsT9w\nEzAQODczx8/x3Jk517/Q3DD1xYl9s2NSFwxZbr26uyDNFW1vPRulPvuNB67v0b/386+5ZbG+9ZTD\nxZIkFeJwsSSpXq5dLElSIQ1eu9ggK0mqV/llFWtjkJUk1avBmayFT5IkFWImK0mql4VPkiQV0uDh\nYoOsJKleDc5knZOVJKkQM1lJUr0anMkaZCVJtSp9g4A6GWQlSfUyk5UkqZAGVxdb+CRJUiFmspKk\nejlcLElSIQ0eLjbISpLqZSYrSVIhDc5kLXySJKkQM1lJUr0cLpYkqRCDrCRJhTgnK0mSuspMVpJU\nL4eLJUkqpMHDxQZZSVK9zGQlSSqkwZmshU+SJBViJitJqpfDxZIkFWKQlSSpkMy6e1CMQVaSVK8G\nZ7IWPkmSVIiZrCSpXg3OZA
2ykqR6Nfg6WYOsJKleDc5knZOVJKkQM1lJUr28hEeSpEIaPFxskJUk\n1csgK0lSIQ2uLrbwSZKkQsxkJUm1ynYLnyRJKsM5WUmSCmnwnKxBVpJUrwYPF1v4JElSIWaykqR6\nOScrSVIhBllJkgpp8NrFzslKklSImWzDHHn8j7n9T/ewxOKL8auLfg7AGedcxNWjb2TxxRYF4Jt7\n7876n1qbqVOncuyPfsr4Rx8nBgSHf3Mf1v7YRwHYY/9DefHFfzF48GAARv7kf1hy8cXq+VLSLAwb\nthznn3sa7x26FJnJ2WdfzE9PP6fubqmrCg8XR8S5wJbAC5m5etV2ErAV8BbwBLBnZk6pXvsusBcw\nDTgwM2+q2jcFTgMGAmdn5glzOrdBtmG23XwTdtlha/77Bye/o323nbZlz112fEfbVaNvBOCXF57J\nP1+awr7fOYrLzj6NAQNaAxwnHH0oq3/o/b3Tcakb2traOOTQY7n/gYdYaKEFuWfMjfzultt55JHH\n6+6auqL8JTznA6cDF3Rouxn4bma2RcSJwHeBwyLiw8DOwGrAcsDvImL6P4RnAJsAk4CxETE6Mx+e\n3YkdLm6YT6z5ERZdZOFOHfvEU39j7Y+vAcCSiy/GwgstyPhH/cdJ847nnnuB+x94CID//OdVHn30\ncZZfbpmae6Uuy/aebXP6+MzbgX+9q+23mdlWPb0bGFbtbwNclplvZuaTwARg7WqbkJkTM/Mt4LLq\n2NkqHmQj4qMRsXVEbD99K31O/X+XXn0d2311X448/se8/Mq/AfjAKsP5wx/vpq1tGpP+/hwPPzaB\n557/x4z3HHX8qeyw+378/LxLyAYXJqgZVlxxGGuusTpj7rm/7q6oq9qzZ1vPfQ24odpfHnimw2uT\nqrZZtc9W0SBbjYOfC+xAa+x7K1rj4rM6fkRE3BsR9559waUlu9av7LTdFtxwxblcff4ZLL3kEpx0\n+lkAbLfFFxi69FLstNeBnHjaL1hz9Q8xYGDrJ3Hi0YfyywvP5IKfncR9f3mI0TfeUudXkGZrwQUX\n4IrLz+Kgg4/m3//+T93dUS/rGDuqbUQX3nsE0AZcXKJvpedk18nMD3f24MwcCYwEmPriRFOnuWSp\nJRafsb/j1pux3yFHAzBo0EAO++beM177yt4HsdIKrT/Mhi69FND6x2uLTTbkoYf/yjabfa4Xey11\nzqBBg7jy8rO49NJf8qtf3TDnN6jPyR4WPnWMHV0REXvQSvw2zreH654FVuhw2LCqjdm0z1Lp4eK7\nqklk1egfL749FXHLbXeyyvtWBOD1N97gtdffAODOe/7MoIEDWXn4irS1TeOlKS8DMLWtjdvuHDPj\nPVJfc9bIU3jk0Qn85LQu/xurvqKG4eKqUvhQYOvMfK3DS6OBnSNicEQMB1YF7gHGAqtGxPCImI9W\ncdToOZ2ndCZ7Aa1A+xzwJhBAZuZHC5+33zrk6BMYe/+DTJnyChtvuyvf2Gs3xt7/II89PhECll9m\nKEcfeiAA/3rpZfb+9hHEgAEMXXpJfvi9gwF4a+pU9j7oSKa2tdE+rZ11PrkWO269aZ1fS5qpT3/q\nk+y26448OO5h7h37WwCOOuoEbrjx9zX3TF1S+C48EXEpsAGwVERMAo6mVU08GLg5IgDuzsx9MnN8\nRFwBPExrGHm/zJxWfc7+wE20LuE5NzPHz/HcJQtaImICcBAwDpjxfzEzn57Tex0uVhMMWW69ursg\nzRVtbz0bpT771eN27dG/9wseeVGxvvVU6Uz2H5k5x3RaktSPNfhWd6WD7P0RcQlwHa3hYgAy85rC\n55UkzSu8QUC3DaEVXD/foS0Bg6wkqcVMtnsyc8+Sny9JaoDChU91KhpkI2J+WossrwbMP709M79W\n8rySJPUFpa+TvRBYBvgCcButi3f/XfickqR5Sf3LKhZTek52lcz8YkRsk5mjqiKoOwqfU5I0
D+np\nik99WekgO7V6nBIRqwPPAe8tfE5J0rykj2ejPVE6yI6MiMWBI2ktP7UQcFThc0qS5iUG2W67kNYd\neFYCRlVtQwufU5KkPqF0kL0WeBm4jw6LUUiSNIOX8HTbsMx0ZXlJ0qw5XNxtd0bERzJzXOHzSJLm\nUWmQ7ZqIGEdr+cRBwJ4RMRFvdSdJ6mdKZbJbFvpcSVLTmMl2TWfuFytJEuBdeCRJKsZMVpKkQhoc\nZEvfIECSpH7LTFaSVKvM5mayBllJUr0aPFxskJUk1csgK0lSGU1e8cnCJ0mSCjGTlSTVq8GZrEFW\nklSv5i74ZJCVJNXLOVlJktRlZrKSpHo1OJM1yEqS6uWcrCRJZTR5TtYgK0mqV4MzWQufJEkqxExW\nklQrh4slSSqlwcPFBllJUq3SICtJUiENDrIWPkmSVIiZrCSpVg4XS5JUikFWkqQympzJOicrSVIh\nZrKSpFo1OZM1yEqSamWQlSSplIy6e1CMQVaSVKsmZ7IWPkmSVIiZrCSpVtnucLEkSUU0ebjYICtJ\nqlVa+CRJUhlNzmQtfJIkqRCDrCSpVtkePdo6IyK+GREPRcT4iPhW1bZERNwcEY9Xj4tX7RER/xsR\nEyLiwYj4WHe/m0FWklSrzJ5tcxIRqwNfB9YG1gC2jIhVgMOBWzJzVeCW6jnAZsCq1TYCOLO7380g\nK0mqVS9ksh8CxmTma5nZBtwGbA9sA4yqjhkFbFvtbwNckC13A4tFxLLd+W4GWUlS0z0ErBcRS0bE\nAsDmwArA0MycXB3zHDC02l8eeKbD+ydVbV1mdbEkqVY9XYwiIkbQGtadbmRmjpzx+ZmPRMSJwG+B\nV4EHgGnv6ENmRkQnBp+7xiArSapVZ+ZVZ//+HAmMnMMx5wDnAETE8bSy0+cjYtnMnFwNB79QHf4s\nrUx3umFVW5c5XCxJqlUvVRe/t3r8L1rzsZcAo4Hdq0N2B66t9kcDX62qjNcBXu4wrNwlZrKSpFr1\n0opPV0fEksBUYL/MnBIRJwBXRMRewNPAl6pjf0Nr3nYC8BqwZ3dPapCVJDVeZq43k7Z/AhvPpD2B\n/ebGeQ2ykqRaNXlZRYOsJKlW7d4gQJKkMvrlXXgi4jpgloXVmbl1kR5JkvqV/nrT9pN7rReSJDXQ\nLINsZt42fT8ihgD/lZmP9UqvJEn9Rk8Xo+jL5rgYRURsRWsJqhur52tGxOjSHZMk9Q+9sRhFXTpT\n+HQMrdsD/QEgMx+IiOEF+yRJ6keaXF3cmWUVp2bmy+9qa3ByL0nS3NGZTHZ8ROwCDIyIVYEDgTvL\ndkuS1F80+RKezmSyBwCrAW8ClwKvAN8q2SlJUv+R2bOtL5tjJpuZrwFHVJskSXNVk+dk5xhkI+JW\nZjIHm5kbFemRJKlfafJwcWfmZA/usD8/sAPQVqY7kiQ1R2eGi+97V9OfIuKeQv2RJPUzfX1etSc6\nM1y8RIenA4CPA4sW61FlweXXL30Kqbi1llq57i5IfV6/npMF7qM1Jxu0homfBPYq2SlJUv/R3+dk\nP5SZb3RsiIjBhfojSepnmpzJduY62ZktPHHX3O6IJElNM7v7yS4DLA8MiYi1aA0XAywCLNALfZMk\n9QMNrnua7XDxF4A9gGHAKbwdZF8B/rtstyRJ/UWTh4tndz/ZUcCoiNghM6/uxT5JkvqRJhc+dWZO\n9uMRsdj0JxGxeEQcV7BPkiQ1QmeC7GaZOWX6k8x8Cdi8XJckSf1Jew+3vqwzl/AMjIjBmfkmQEQM\nAbyER5I0VyTNHS7uTJC9GLglIs6jVfy0BzCqZKckSf1He4PLizuzdvGJEfEX4HO0Kq1vAlYs3TFJ\nUv/Q3uBMtjNzsgDP0wqwXwQ2Ah4p1iNJkhpidotRvB/4crW9CFwORGZu2Et9kyT1A/11TvZR4A5g\ny8ycABAR3+6VXkmS+o2+XiHcE7MbLt4emAzcGhFnRcTG
0OA/NyRJtUiiR1tfNssgm5m/ysydgQ8C\ntwLfAt4bEWdGxOd7q4OSJM2r5lj4lJmvZuYlmbkVrXWM7wcOK94zSVK/0N8Xo5ihWu1pZLVJktRj\nfT1Q9kSXgqwkSXNbX59X7QmDrCSpVu3NjbGdXoxCkiR1kZmsJKlWTV5W0SArSapVg+8PYJCVJNXL\n6mJJkgppj+YOF1v4JElSIWaykqRaOScrSVIhzslKklSIi1FIkqQuM5OVJNXKxSgkSSrEwidJkgpp\n8pysQVaSVKsmVxdb+CRJUiFmspKkWjknK0lSIU2ek3W4WJJUq/Yebp0REYtFxFUR8WhEPBIR60bE\nEhFxc0Q8Xj0uXh0bEfG/ETEhIh6MiI9197sZZCVJteqNIAucBtyYmR8E1gAeAQ4HbsnMVYFbqucA\nmwGrVtsI4MzufjeDrCSp0SJiUWB94ByAzHwrM6cA2wCjqsNGAdtW+9sAF2TL3cBiEbFsd85tkJUk\n1SqjZ1tEjIiIeztsI951iuHAP4DzIuL+iDg7IhYEhmbm5OqY54Ch1f7ywDMd3j+pausyC58kSbXq\n6XWymTkSGDmbQwYBHwMOyMwxEXEabw8NT/+MjIi5XuhsJitJqlUvzMlOAiZl5pjq+VW0gu7z04eB\nq8cXqtefBVbo8P5hVVuXGWQlSY2Wmc8Bz0TEB6qmjYGHgdHA7lXb7sC11f5o4KtVlfE6wMsdhpW7\nxOFiSVKtemkxigOAiyNiPmAisCetRPOKiNgLeBr4UnXsb4DNgQnAa9Wx3WKQlSTVqjcWo8jMB4BP\nzOSljWdybAL7zY3zGmQlSbVq8g0CDLKSpFo1Ocha+CRJUiFmspKkWnkXHkmSCmnyXXgMspKkWjV5\nTtYgK0mqVZOHiy18kiSpEDNZSVKt2hucyxpkJUm1ck5WkqRCmpvHOicrSVIxZrKSpFo5XCxJUiEu\nRiFJUiFWF0uSVEhzQ6yFT5IkFWMmK0mqlYVPkiQV4pysJEmFNDfEGmQlSTVr8nCxhU+SJBViJitJ\nqpVzspIkFdLcEGuQlSTVzDlZSZLUZWaykqRaZYMHjA2ykqRaNXm42CArSaqV1cWSJBXS3BBr4ZMk\nScUYZPuJkb84mUnPPMD9f/7djLY1Pvph7rh9NGPvuYm77vw1n/jEmjX2UJq5ocu9lzOv/AmX/+EC\nLr91FDvvtSMABx61L1fefiGX/O48fnTOcSy0yELvfN/y7+W2x29k1312rqPb6oJ2skdbX2aQ7Scu\nuPBKttxq13e0Hf/DIzjuf07lk2t/gWO/fwo/PP6ImnonzVpb2zR+8v2fsdMGX2XPLfdhxz22Y/iq\nKzLm9nvZecM92OVze/K3iZPY44B3/r6/ffT+3Pn7MTX1Wl3R3sOtLzPI9hN//OMYXnppyjvaMpNF\nFm799b/oIgszefLzdXRNmq1/vvBPHhv3VwBee/V1nprwNEsvuzRjbhvLtGnTAHjovvEMXXbpGe/5\n7Kaf4e/PTGbiX5+qo8vqouzhf31Z0cKniBgIbAGs1PFcmfnjkudV5xx88DFcf93FnHDCUQwYMIDP\nbrBN3V2SZmvZYcvwgdVXZfyfH35H+9Zf3pybr/09AEMWGMJXv7EL++/8HXbd16HieUFfz0Z7onQm\nex2wB7AksHCHbaYiYkRE3BsR97ZPe7Vw1zRixFc55JBjWXmVtTnkkGP4xS9OrrtL0iwNWWAIJ579\nA378vZ/y6n9em9G+54G70dY2jRuuuRmAEQfvyaVnXcnrr71eV1elGUpfwjMsMz/a2YMzcyQwEmC+\nwcP69hhAA+y2644cdND3ALjq6uv5+c9PqrlH0swNHDSQE8/+ATdeczO33nD7jPYtv7Qpn/ncunxj\np2/PaFttrQ+x0Raf5YAj92HhRRaivT158823uPK8a+roujqhrw/59kTpIHtDRHw+M39b+DzqhsmT\nn2f99dfl9tvvYsMN
P82ECU/W3SVppo465TCeevxpLhl5xYy2dTdYm92+sQt7b38Ab77+5oz2Edsd\nMGP/69/Zk9dffd0A28c1ebi4dJC9G/hlRAwApgIBZGYuUvi8epcLLzid9ddfl6WWWoKJT4zl+z84\nhX32PZQfn3IsgwYN4o033mTfbxxWdzel/2eNtT/CFl/clMcffoKLbz4HgDN+eBYH/+BA5hs8H2dc\n3irxGHffw5xw+Cl1dlXd1J7NzWQjC365iHgS2AYYl108kcPFaoI1lnxf3V2Q5oqxf789Sn32bitu\n36N/7y98+ppifeup0pnsM8BDXQ2wkqT+o8kBonSQnQj8ISJuAGZMmngJjyRpur6+alNPlA6yT1bb\nfNUmSdI7WF3cTZl5bMnPlyTN+6wu7qaIWBo4FFgNmH96e2ZuVPK8kiT1BaVXfLoYeBQYDhwLPAWM\nLXxOSdI8xLvwdN+SmXkOMDUzb8vMrwFmsZKkGbxBQPdNrR4nR8QWwN+BJQqfU5I0D3FOtvuOi4hF\nge8APwUWAb49+7dIkvqTJi+lULq6+Ppq92Vgw5LnkiSpryk6JxsR74uI6yLixYh4ISKujQjXmZMk\nzWDhU/ddAlwBLAMsB1wJXFr4nJKkeUh7D7e+rHSQXSAzL8zMtmq7iA7Xy0qSVLq6OCLmj4h7IuIv\nETE+Io6t2odHxJiImBARl0fEfFX74Or5hOr1lbr73UoH2Rsi4vCIWCkiVoyIQ4HfRMQSEWGVsSSp\nN7wJbJSZawBrAptGxDrAicCpmbkK8BKwV3X8XsBLVfup1XHdUrq6+EvV4968faOFAHaunjs/K0n9\nXOl51epOcP+pnr6n2pLWug27VO2jgGOAM2ndovWYqv0q4PSIiO7cUa50JnsYsEZmDgfOA/4C7JCZ\nwzPTACtJIjN7tHVGRAyMiAeAF4CbgSeAKZnZVh0yCVi+2l+e1q1aqV5/GViyO9+tdJA9MjNfiYjP\n0PqL4WxafyVIkgT0vPApIkZExL0dthHvPkdmTsvMNYFhwNrAB4t/McoPF0+rHrcAzsrMX0fEcYXP\nKUmah/R0acTMHAmM7OSxUyLiVmBdYLGIGFRlq8OAZ6vDngVWACZFxCBgUeCf3elb6Uz22Yj4BbAT\nrYKnwb1wTkmSZoiIpSNisWp/CLAJ8AhwK7BjddjuwLXV/ujqOdXrv+/OfCz0TuHTpsDJ1V8PywKH\nFD6nJGke0gsLSiwLjIqIgbQSvSsy8/qIeBi4rBphvR84pzr+HODCiJgA/ItWsW63lF5W8TXgmg7P\nJwOTS55TkjRvKb12cWY+CKw1k/aJtOZn393+BvDFuXHu0pmsJEmz1deXRuwJ50clSSrETFaSVKu+\nfuP1njDISpJq1e79ZCVJKqO5IdYgK0mqmYVPkiSpy8xkJUm1anIma5CVJNWq9GIUdTLISpJqZSYr\nSVIhTb5O1sInSZIKMZOVJNXKOVlJkgpxTlaSpEKanMk6JytJUiFmspKkWjlcLElSIU2+hMcgK0mq\nlbe6kySpkCZnshY+SZJUiJmsJKlWDhdLklRIk4eLDbKSpFqZyUqSVEiTM1kLnyRJKsRMVpJUK4eL\nJUkqpMnDxQZZSVKtMtvr7kIxzslKklSImawkqVbehUeSpEKafNN2g6wkqVZmspIkFdLkTNbCJ0mS\nCjGTlSTVysUoJEkqxMUoJEkqpMlzsgZZSVKtmlxdbOGTJEmFmMlKkmrlcLEkSYVYXSxJUiFNzmSd\nk5UkqRAzWUlSrZpcXWyQlSTVqsnDxQZZSVKtLHySJKmQJi+raOGTJEmFmMlKkmrlcLEkSYVY+CRJ\nUiFNnpM1yEqSatXkTNbCJ0mSCjGTlSTVqsmZrEFWklSr5oZYiCb/BaHZi4gRmTmy7n5IPeVvWX2V\nc7L924i6OyDNJf6W1ScZZCVJKsQgK0lSIQbZ/s05LDWFv2X1SRY+SZJUiJmsJEmFGG
QbKCJWioiH\n6u6HJPV3BllJkgoxyDbXwIg4KyLGR8RvI2JIRHw9IsZGxF8i4uqIWAAgIs6PiDMj4u6ImBgRG0TE\nuRHxSEScX/P3UD8TEQtGxK+r3+lDEbFTRDwVET+KiHERcU9ErFIdu1VEjImI+yPidxExtGo/JiJG\nRcQdEfF0RGzf4f03RsR76v2W6i8Mss21KnBGZq4GTAF2AK7JzE9m5hrAI8BeHY5fHFgX+DYwGjgV\nWA34SESs2as9V3+3KfD3zFwjM1cHbqzaX87MjwCnAz+p2v4IrJOZawGXAYd2+JyVgY2ArYGLgFur\n978ObFH+a0gG2SZ7MjMfqPbvA1YCVq/+sh8HfIVWEJ3uumyVmo8Dns/McZnZDoyv3iv1lnHAJhFx\nYkSsl5kvV+2Xdnhct9ofBtxU/aYP4Z2/6Rsyc2r1eQN5O1iPw9+0eolBtrne7LA/jdbNIM4H9q/+\nmj8WmH8mx7e/673teCMJ9aLM/CvwMVrB8LiI+N70lzoeVj3+FDi9+k3vzUx+09Ufi1Pz7esV/U2r\n1xhk+5eFgcnVfNRX6u6MNDMRsRzwWmZeBJxEK+AC7NTh8a5qf1Hg2Wp/917rpNRJ/jXXvxwFjAH+\nUT0uXG93pJn6CHBSRLQDU4F9gauAxSPiQVoZ6perY48BroyIl4DfA8N7v7vSrLnik6Q+LyKeAj6R\nmS/W3RepKxwuliSpEDNZSZIKMZOVJKkQg6wkSYUYZKVCImK1iNi67n5Iqo9BVuqkiJgWEQ9U6+le\nOX3t51kc+1/AEcAfZvH6BhFxfbW/dUQcXu1vGxEfLtB9STUwyEqd93pmrlmtp/sWsE/HF6NlAEBm\n/i0zd8nMV+b0oZk5OjNPqJ5uCxhkpYYwyErdcwewSnXv3sci4gLgIWCFiPh8RNwVEX+uMt6FACJi\n04h4NCL+DGw//YMiYo+IOD0iPkVrMfuTqox55Tq+mKS5xyArdVFEDAI2o7W2LrTuePSz6o5HrwJH\nAp/LzI8B9wIHRcT8wFnAVsDHgWXe/bmZeSetOyAdUmXMTxT/MpKKcllFqfOGRMT0OxvdAZwDLAc8\nnZl3V+3r0Bru/VNEAMxHa53dD9K6M9LjABFxETCiF/suqQYGWanzXs/Md9xbtwqkr3ZsAm7OzC+/\n6zjvySv1Qw4XS3PX3cCnI2IVgIhYMCLeDzwKrNRhnvXLs3j/v/HGDVJjGGSluSgz/wHsAVxa3THm\nLuCDmfkGreHhX1eFTy/M4iMuAw6JiPstfJLmfa5dLElSIWaykiQVYpCVJKkQg6wkSYUYZCVJKsQg\nK0lSIQZZSZIKMchKklSIQVaSpEL+DxIeuChB+c36AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HGE0AjFI5KJ2", + "colab_type": "text" + }, + "source": [ + "On remarque une légère amélioration ; le SVM est donc un bon choix pour ce jeu de données" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8iz2DK4OpaFh", + "colab_type": "code", + "outputId": "037567ac-bdfd-46f7-a4d0-2d945a09b883", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 615 + } + }, + "source": [ + "from IPython.display import display\n", + "\n", + "for predicted in category_id_df.category_id:\n", + " for actual in category_id_df.category_id:\n", + " if predicted != actual and conf_mat[actual, predicted] >= 6:\n", + " print(\"'{}' est prédit comme '{}' : {} exemples.\".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))\n", + " display(df_stem.loc[indices_test[(y_test == actual) & (y_pred == predicted)]][['Category', 'Message_stemmed_without_StopWords']])\n", + " print('')" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "'spam' est prédit comme 'ham' : 18 exemples.\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMessage_stemmed_without_StopWords
684spamhi i 'm sue . i 20 year old work lapdanc . i love sex . text live - i 'm bedroom . text sue 89555 . by textoper g2 1da 150ppmsg 18+
731spamemail alertfrom : jeri stewarts : 2kbsubject : low-cost prescripiton drvgsto listen email call 123
1940spammore peopl dog area . call 09090204448 join like mind guy . whi arrang 1 . there 's 1 even . a£1.50 minapn ls278bb
751spamdo realiz 40 year , ll thousand old ladi run around tattoo ?
4213spammiss call alert . these number call left messag . 07008009200
4298spamthesmszone.com let send free anonym mask messages..im send messag there..do see potenti abus ? ? ?
5466spamhttp//tms . widelive.com/index . wml ? id=820554ad0a1705572711 & first=true¡c c ringtone¡
333spamcall germani 1 penc per minut ! call fix line via access number 0844 861 85 85 . no prepay . direct access !
3864spamoh god ! i ve found number ! i 'm glad , text back xafter msgs cst std ntwk chg £1.50
5449spamlatest news ! polic station toilet stolen , cop noth go !
1430spamfor sale - arsenal dartboard . good condit doubl trebl !
2247spamhi ya babe x u 4goten bout ? ' scammer get smart..though regular vodafon , respond get prem rate msg/subscript . other nos use also . bewar !
3991spam( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per..
3460spamnot heard u4 . call night knicker . make beg like u last time 01223585236 xx luv nikiyu4.net
2823spamromcapspam everyon around respond well presenc sinc warm outgo . you bring real breath sunshin .
2402spambabe : u want dont u babi ! im nasti thing 4 filthyguy . fanci rude time sexi bitch . how go slo n hard ! txt xxx slo ( 4msgs )
3501spamdorothi @ kiefer.com ( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per..
4821spamcheck out choos your babe video @ sms.shsex.netun fgkslpopw fgkslpo
\n", + "
" + ], + "text/plain": [ + " Category Message_stemmed_without_StopWords\n", + "684 spam hi i 'm sue . i 20 year old work lapdanc . i love sex . text live - i 'm bedroom . text sue 89555 . by textoper g2 1da 150ppmsg 18+ \n", + "731 spam email alertfrom : jeri stewarts : 2kbsubject : low-cost prescripiton drvgsto listen email call 123 \n", + "1940 spam more peopl dog area . call 09090204448 join like mind guy . whi arrang 1 . there 's 1 even . a£1.50 minapn ls278bb \n", + "751 spam do realiz 40 year , ll thousand old ladi run around tattoo ? \n", + "4213 spam miss call alert . these number call left messag . 07008009200 \n", + "4298 spam thesmszone.com let send free anonym mask messages..im send messag there..do see potenti abus ? ? ? \n", + "5466 spam http//tms . widelive.com/index . wml ? id=820554ad0a1705572711 & first=true¡c c ringtone¡ \n", + "333 spam call germani 1 penc per minut ! call fix line via access number 0844 861 85 85 . no prepay . direct access ! \n", + "3864 spam oh god ! i ve found number ! i 'm glad , text back xafter msgs cst std ntwk chg £1.50 \n", + "5449 spam latest news ! polic station toilet stolen , cop noth go ! \n", + "1430 spam for sale - arsenal dartboard . good condit doubl trebl ! \n", + "2247 spam hi ya babe x u 4goten bout ? ' scammer get smart..though regular vodafon , respond get prem rate msg/subscript . other nos use also . bewar ! \n", + "3991 spam ( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per.. \n", + "3460 spam not heard u4 . call night knicker . make beg like u last time 01223585236 xx luv nikiyu4.net \n", + "2823 spam romcapspam everyon around respond well presenc sinc warm outgo . you bring real breath sunshin . \n", + "2402 spam babe : u want dont u babi ! im nasti thing 4 filthyguy . fanci rude time sexi bitch . how go slo n hard ! 
txt xxx slo ( 4msgs ) \n", + "3501 spam dorothi @ kiefer.com ( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per..\n", + "4821 spam check out choos your babe video @ sms.shsex.netun fgkslpopw fgkslpo " + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_J_FXRewpaFk", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6EnkB1U75g_q" + }, + "source": [ + "\n", + "## 6.2. Le cas de la lemmatisation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qaFeUq4x5g_v" + }, + "source": [ + "Création d'un dataframe spécifique aux messages lemmatisés" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "eo0mCQlF5g_y", + "colab": {} + }, + "source": [ + "col = ['Category', 'Message_lemmatized_before_stemming','category_id']\n", + "df_lem = spam[col]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "20be06a6-f249-4a26-8697-a51975167dc4", + "id": "O1FDhanu5g_1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 212 + } + }, + "source": [ + "df_lem.head()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMessage_lemmatized_before_stemmingcategory_id
0hamGo jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ...0
1hamOk lar ... Joking wif u oni ...0
2spamFree entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's1
3hamU dun say early hor ... U c already say ...0
4hamNah I n't think go usf , life around though0
\n", + "
" + ], + "text/plain": [ + " Category ... category_id\n", + "0 ham ... 0 \n", + "1 ham ... 0 \n", + "2 spam ... 1 \n", + "3 ham ... 0 \n", + "4 ham ... 0 \n", + "\n", + "[5 rows x 3 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 34 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ptWn7yRg5g_5" + }, + "source": [ + "Maintenant, chacun des 5572 messages lemmatisés est représenté par 2566 caractéristiques (features), correspondant aux scores tf-idf de différents unigrammes et bigrammes." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "46a61288-8350-44ae-a200-99c5255385c0", + "id": "VMj1vFpJ5g_6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "corpus_stem = df_lem.Message_lemmatized_before_stemming\n", + "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", + "featuresLem = vectorizer.fit_transform(corpus_stem).toarray()\n", + "labels = df_lem.category_id\n", + "featuresLem.shape" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(5572, 2566)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 36 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "m3AcMEM55g__" + }, + "source": [ + "#### 6.2.1. 
Classification à l'aide du Naive Bayes" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "GDVyKOtE5hAA", + "colab": {} + }, + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(df_lem['Message_lemmatized_before_stemming'], df_lem['Category'], random_state = 0)\n", + "count_vect = CountVectorizer()\n", + "X_train_counts = count_vect.fit_transform(X_train)\n", + "tfidf_transformer = TfidfTransformer()\n", + "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)\n", + "\n", + "clf = MultinomialNB().fit(X_train_tfidf, y_train)\n", + "y_pred = clf.predict(tfidf_transformer.transform(count_vect.transform(X_test)))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ubAOz_Fn71H7", + "colab_type": "text" + }, + "source": [ + "##### 6.2.1.1. 
F1-Score" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "d600749a-cca9-4fe6-a007-127243502c1d", + "id": "Farl3n865hAC", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "source": [ + "from sklearn import metrics\n", + "print(metrics.classification_report(y_test, y_pred, target_names=df_lem['Category'].unique()))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " ham 0.98 1.00 0.99 1208\n", + " spam 1.00 0.88 0.94 185\n", + "\n", + " accuracy 0.98 1393\n", + " macro avg 0.99 0.94 0.96 1393\n", + "weighted avg 0.98 0.98 0.98 1393\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "97338842-307d-4c9f-c05c-463e8fddab23", + "id": "Q0P9HCGM5hAF", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "print(clf.predict(tfidf_transformer.transform(count_vect.transform([\"Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . 
Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's\"]))))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['spam']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "0cc5c06a-b266-4fa0-b097-8dcdd330b4b3", + "id": "ptPFEuio5hAI", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "print(clf.predict(tfidf_transformer.transform(count_vect.transform([\"Nah I n't think go usf , life around though\"]))))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['ham']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HzNSltF75hAL" + }, + "source": [ + "On remarque que 22 messages ont été mal classés (22 spam prédits comme ham, 0 ham prédit comme spam)" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "2ce176b5-dcbb-4bb8-f586-eb597c56d629", + "id": "xBaEUblc5hAN", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 388 + } + }, + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "conf_mat = confusion_matrix(y_test, y_pred)\n", + "fig, ax = plt.subplots(figsize=(8,6))\n", + "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", + "plt.ylabel('Actuel')\n", + "plt.xlabel('Prédit')\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAeOklEQVR4nO3deZRdVZX48e9OhQREhIBCIImEyR+C\nQBBERF2CKBiVQVGGVgRkGX+20iLtgEqj2Djb+GtaGw10NExhUBGQSUBEWpkCBMKokUEIYVIBZchQ\ntX9/vJvwiBmq8nLqVu77flh31Xun7nvnPNZbtbPP2ffcyEwkSdLKN6zuAUiS1FQGWUmSCjHISpJU\niEFWkqRCDLKSJBVikJUkqZDhdQ9gaeY/ca/XFmmVt8ZGb657CNJKsWDe7Cj13p3+vV/t5ZsWG1un\nhmyQlSR1ib7eukdQjNPFkiQVYiYrSapX9tU9gmIMspKkevUZZCVJKiIbnMm6JitJUiFmspKkejld\nLElSIQ2eLjbISpLq1eDrZA2ykqR6NTiTtfBJkqRCzGQlSfWy8EmSpDKafJ2sQVaSVC8zWUmSCmlw\nJmvhkyRJhZjJSpLq5XWykiQV0uDpYoOsJKleDS58ck1WkqRCzGQlSfVyuliSpEIaPF1skJUk1SrT\n6mJJkspo8HSxhU+SJBViJitJqpdrspIkFdLg6WKDrCSpXm6rKElSIQ3OZC18kiSpEDNZSVK9LHyS\nJKmQBk8XG2QlSfVqcCbrmqwkSYWYyUqS6tXgTNYgK0mqlTcIkCSpFDNZSZIKaXB1sYVPkiQVYiYr\nSaqX08WSJBXS4Olig6wkqV5mspIkFdLgTNbCJ0mSCjGTlSTVy+liSZIKMchKklSIa7KSJGmgzGQl\nSfVyuliSpEIaPF1skJUk1avBmaxrspKkemVfZ8dyRMSUiHgsIm5va1s3Ii6PiD9UP0dV7RERJ0bE\nrIi4LSJe2/aaQ6rz/xARh/TnoxlkJUlN92PgHYu1HQ1cmZlbAFdWzwEmAltUxyTgJGgFZeBLwOuB\nnYAvLQzMy2KQlSTVq6+vs2M5MvM3wF8Wa94HmFo9ngrs29Z+arZcB6wTERsCewKXZ+ZfMvOvwOX8\nY+D+B67JSpLqVc+a7AaZOad6/AiwQfV4DPBg23kPVW1La18mM1lJUr0yOzoiYlJETG87Jg2s+0wg\nS3w0M1lJUr06zGQzczIweYAvezQiNszMOdV08GNV+2xgXNt5Y6u22cCui7X/enmdmMlKkrrRBcDC\nCuFDgPPb2j9UVRnvDDxVTStfBuwREaOqgqc9qrZlMpOVJNWr8JpsREyjlYW+PCIeolUl/A3gnIg4\nHHgA2L86/WLgncAs4FngMIDM/EtE/DtwY3XeVzJz8WKqf2CQlSTVq/COT5l50FJ+tfsSzk3g40t5\nnynAlIH0bZCVJNXLHZ8kSdJAmclKkuqVRa6eGRIMspKkejV4utggK0mql0FWkqRCGnw/WQufJEkq\nxExWklSr7LPwSZKkMlyTlSSpkAavyRpkJUn1avB0sYVPkiQVYiYrSaqXa7KSJBVikJUkqZAG713s\nmqwkSYWYyTbMMV87gd/89gbWHbUOPz/9BwB853uncPVvr2f4asMZN2ZDjv/CUbxsrZcCcPKpZ/Oz\nX1xGz7BhfP5TH+ONr98BgFPPOo+fXngpEcEWm43n+C8cxciRI2r7XNLS7LnHrpxwwlfoGTaMKT+a\nxre+/f26h6SBavB0sZlsw+z7zrfzgxOOf1HbG163Peed9gPOO/Ukxo8bwymnnQ3AH+97gEuuvJrz\nT/8BPzjheP79O9+jt7eXRx9/gjN+cj5nTzmRn5/+A/r6+rjkiqvr+DjSMg0bNowT//OrvHuvD7LN\ndrtxwAH78upXb1H3sDRQfdnZMYQZZBtmxwnbsPbL1npR2xtfv
wPDh/cAsO3WW/LoY08A8KtrrmPi\n7m9hxIgRjN1oNK8cuxEz7/o9AAt6e5k7dx4LFvTy3PNzecXL1x3cDyL1w06v254//vF+7rvvT8yf\nP59zzjmfvffas+5haaCyr7NjCCs+XRwR2wLj2/vKzJ+V7ldLdt5Fv+Qdu78FgMce/zPbvmbLRb/b\nYP2X89jjTzDhNa/m0IP2423v/RCrjxzBLq977aJpZGko2WjMaB586OFFzx+aPYedXrd9jSPSChni\n2WgnimayETEFmALsB+xVHe9exvmTImJ6REw/5dRpJYfWlX44dRo9PT28e4/dlnneU0//jauuuY7L\nzv0Rvzr/DJ57fi4XXvarQRqlJDVH6Ux258zcqr8nZ+ZkYDLA/Cfube4/bWrw84su5ze/vYFTTvw6\nEQHA+q9Yj0cefXzROY8+9gTrv+LlXDd9BmM22oB1R60DwO5v2YUZM+9krz3fWsvYpaV5ePYjjBu7\n0aLnY8dsyMMPP1LjiLQi0sKnFXZtRPQ7yKqM/71uOlPOPJf/+uaXWGP11Re17/amnbnkyquZN28e\nDz38CH966GG2efWr2HCDV3Db7Xfz3PPPk5lcP30Gm248rsZPIC3ZjdNnsPnmmzB+/DhWW2019t9/\nHy78xS/rHpYGqsGFT6Uz2VNpBdpHgLlAAJmZ2xbut2t95kvf4MZbbuPJJ59m930/yD8ffjCnnHY2\n8+bP5yNHfhFoFT996bNHsPmmG7PnW9/M3h/4KMN7evjiUf9MT08P2269JW/f7U3sf9gR9PT0sOWr\nNuP9+0ys+ZNJ/6i3t5dPHnkMF190Jj3DhvHjqWdz552/r3tYGqghXrzUiciCO21ExCzgKGAmsOj/\nYmY+sLzXOl2sJlhjozfXPQRppVgwb3aUeu9njv9gR3/v1zzm9GJj61TpTPbxzLygcB+SpFXZEJ/y\n7UTpIHtLRJwJXEhruhjwEh5JUpsGFz6VDrJr0Aque7S1JWCQlSS1mMmumMw8rOT7S5IaoMGFT0WD\nbESsDhwObA0sunYkMz9csl9JkoaC0tfJngaMBvYErgbGAn8r3KckaVXidbIrbPPMfH9E7JOZU6si\nqGsK9ylJWoU0ecen0kF2fvXzyYh4DfAIsH7hPiVJq5Ihno12onSQnRwRo4BjgAuAlwL/VrhPSdKq\nxCC7wk6jdQee8cDUqm2Dwn1KkjQklA6y5wNPATfRthmFJEmLeAnPChubme8o3IckaVXmdPEK+11E\nbJOZMwv3I0laRaVBdmAiYiat7ROHA4dFxL14qztJUpcplcm+u9D7SpKaxkx2YPpzv1hJkgDvwiNJ\nUjFmspIkFdLgIFv6BgGSJHUtM1lJUq0ym5vJGmQlSfVq8HSxQVaSVC+DrCRJZTR5xycLnyRJKsRM\nVpJUrwZnsgZZSVK9mrvhk0FWklQv12QlSdKAmclKkurV4EzWICtJqpdrspIkldHkNVmDrCSpXg3O\nZC18kiSpEDNZSVKtmjxdbCYrSapXX4fHckTE/4mIGW3H0xFxZER8OSJmt7W/s+01n4+IWRFxT0Ts\nuaIfzUxWklSrLLwmm5n3ABMAIqIHmA2cBxwGfDczv9N+fkRsBRwIbA1sBFwREa/KzN6B9m0mK0mq\nV+FMdjG7A3/MzAeWcc4+wFmZOTcz7wNmATsNuCcMspKkVVxETIqI6W3HpGWcfiAwre35JyLitoiY\nEhGjqrYxwINt5zxUtQ2YQVaSVKvs6/DInJyZO7Ydk5fUT0SMAPYGzq2aTgI2ozWVPAf4j5X92VyT\nlSTVa/Cuk50I3JyZjwIs/AkQEScDv6iezgbGtb1ubNU2YGaykqRadZrJDsBBtE0VR8SGbb97D3B7\n9fgC4MCIGBkRmwBbADesyGczk5UkNV5ErAm8HfhoW/O3ImICkMD9C3+XmXdExDnAncAC4OMrUlkM\nBllJUs1KX8IDkJnPAOst1
nbwMs7/KvDVTvs1yEqSajUYQbYuBllJUr0y6h5BMQZZSVKtmpzJWl0s\nSVIhZrKSpFpln9PFkiQV0eTpYoOsJKlWaeGTJEllNDmTtfBJkqRCzGQlSbWy8EmSpEIy6x5BOQZZ\nSVKtmpzJuiYrSVIhZrKSpFo1OZM1yEqSauWarCRJhZjJSpJUSJN3fLLwSZKkQsxkJUm1avK2igZZ\nSVKt+ho8XWyQlSTVqslrsksNshFxIbDUwurM3LvIiCRJXaVbq4u/M2ijkCSpgZYaZDPz6oWPI2IN\n4JWZec+gjEqS1DWavBnFci/hiYi9gBnApdXzCRFxQemBSZK6Q/ZFR8dQ1p/Cpy8DOwG/BsjMGRGx\nScExSZK6SJOri/uzGcX8zHxqsbYGJ/eSJK0c/clk74iIfwJ6ImIL4F+A35UdliSpWzT5Ep7+ZLJH\nAFsDc4FpwNPAkSUHJUnqHpmdHUPZcjPZzHwW+GJ1SJK0UjV5TXa5QTYirmIJa7CZ+dYiI5IkdZUm\nTxf3Z032022PVwf2AxaUGY4kSc3Rn+nimxZr+m1E3FBoPJKkLjPU11U70Z/p4nXbng4DdgDWLjai\nylpjdy3dhVTchPU2rXsI0pDX1WuywE201mSD1jTxfcDhJQclSeoe3b4m++rMfL69ISJGFhqPJKnL\nNDmT7c91skvaeOLalT0QSZKaZln3kx0NjAHWiIjtaU0XA7wMeMkgjE2S1AUaXPe0zOniPYFDgbHA\nf/BCkH0a+ELZYUmSukWTp4uXdT/ZqcDUiNgvM386iGOSJHWRJhc+9WdNdoeIWGfhk4gYFRHHFxyT\nJEmN0J8gOzEzn1z4JDP/Cryz3JAkSd2kr8NjKOvPJTw9ETEyM+cCRMQagJfwSJJWiqS508X9CbJn\nAFdGxI9oFT8dCkwtOShJUvfoa3B5cX/2Lv5mRNwKvI1WpfVlwMalByZJ6g59Dc5k+7MmC/AorQD7\nfuCtwF3FRiRJUkMsazOKVwEHVccTwNlAZOZugzQ2SVIX6NY12buBa4B3Z+YsgIj41KCMSpLUNYZ6\nhXAnljVd/F5gDnBVRJwcEbtDg/+5IUmqRRIdHUPZUoNsZv48Mw8EtgSuAo4E1o+IkyJij8EaoCRJ\nq6rlFj5l5jOZeWZm7kVrH+NbgM8VH5kkqSt0+2YUi1S7PU2uDkmSOjbUA2UnBhRkJUla2Yb6umon\nDLKSpFr1NTfG9nszCkmSNEBmspKkWjV5W0WDrCSpVg2+P4BBVpJUryZXF7smK0mqVV9ER0d/RMT9\nETEzImZExPSqbd2IuDwi/lD9HFW1R0ScGBGzIuK2iHjtin42g6wkqVvslpkTMnPH6vnRwJWZuQVw\nZfUcYCKwRXVMAk5a0Q4NspKkWmWHRwf2AaZWj6cC+7a1n5ot1wHrRMSGK9KBQVaSVKtB2lYxgV9G\nxE0RMalq2yAz51SPHwE2qB6PAR5se+1DVduAWfgkSapVp5tRVEFzUlvT5MxcfPvfN2Xm7IhYH7g8\nIu5u/2VmZkSs9EJng6wkaZVWBdRl7qmfmbOrn49FxHnATsCjEbFhZs6ppoMfq06fDYxre/nYqm3A\nnC6WJNWqj+joWJ6IWDMi1lr4GNgDuB24ADikOu0Q4Pzq8QXAh6oq452Bp9qmlQfETFaSVKtB2Ixi\nA+C8aF3uMxw4MzMvjYgbgXMi4nDgAWD/6vyLgXcCs4BngcNWtGODrCSpVqVvEJCZ9wLbLaH9z8Du\nS2hP4OMro2+DrCSpVu74JEmSBsxMVpJUK28QIElSIU2+abtBVpJUqyavyRpkJUm1anKQtfBJkqRC\nzGQlSbVK12QlSSqjydPFBllJUq2aHGRdk5UkqRAzWUlSrdyMQpKkQtyMQpKkQpq8JmuQlSTVqslB\n1sInSZIKMZOVJNXKwidJkgqx8EmSpEKavCZrkJUk1arJ08UWPkmSVIiZrCSpVn0NzmUNspK
kWrkm\nK0lSIc3NY12TlSSpGDNZSVKtnC6WJKkQN6OQJKkQq4slSSqkuSHWwidJkooxk5Uk1crCJ0mSCnFN\nVpKkQpobYg2ykqSaNXm62MInSZIKMZOVJNXKNVlJkgppbog1yEqSauaarCRJGjAzWUlSrbLBE8YG\nWUlSrZo8XWyQlSTVyupiSZIKaW6ItfBJkqRiDLJdYuzYDbnssrO45ZYrufnmK/j4xz8MwNe+9gVu\nvfVX3HjjZZx99mTWXvtlNY9U+kfHnnA0v5x5AWdfNfVF7Qd8eD9+cs3pnP3rU/mXYz4GwNYTXs0Z\nl0/hjMuncOYVP2LXiW+uY8gagD6yo2Moi8yhOcDVV3/l0BzYKmr06PUZPXp9Zsy4nZe+dE2uvfYi\n3v/+jzB27Giuuup39Pb2cvzxnwfgmGO+XvNom+M1ozauewiNsP3O2/HsM8/xlRO/yAG7HQLADrts\nz4c/+SGOPPizzJ83n1HrrcNf//wkI9cYyYJ5C+jt7WW99ddj2pU/YuKE99Db21vzp1i1TZ9zTZR6\n74+Mf39Hf+9Pvv/cYmPrlJlsl3jkkceYMeN2AP7+92e4++5ZjBkzmiuuuGbRH58bbriZsWNH1zlM\naYluue5Wnv7r0y9qe98h+zL1e6czf958AP765ycBmPvc3EXf6ZEjRzBUEwm9IDv8bygrWvgUET3A\nu4Dx7X1l5gkl+9WybbzxWCZM2JobbrjlRe2HHHIAP/nJhTWNShqYV246jgmv345/PnoSc+fO4z+P\n+z533no3AFtvvxXHfvdoNhy7AccecbxZ7BDX5Et4SmeyFwKHAusBa7UdSxQRkyJiekRM7+39e+Gh\ndac113wJ06b9kE9/+jj+9rcX/h9/7nOfYMGCBUybdl6No5P6b/jwHtZe52Uc+q6PcuJX/puvTz5u\n0e/uuOVODtj1Q3xo4iQOO+KDjBg5osaRqpuVvoRnbGZu29+TM3MyMBlcky1h+PDhnHXWDznrrPM4\n//xLF7UffPD7mDhxdyZOPKjG0UkD8+icx/nVxVcDcMeMu8i+ZJ311uHJatoY4P4/PMCzzzzHZltu\nwl233lPXULUcQ33KtxOlM9lLImKPwn2on374w29z992zOPHEUxa1vf3tb+Gooz7G+953OM8993yN\no5MG5upLr2HHN74WaE0dD19tOE/++Uk2GrchPT09AIweuwHjN9+Yhx98pM6hajn6OjyGstKZ7HXA\neRExDJgPBJCZ6XUig2yXXV7HBz6wHzNn3sX1118CwLHHfosTTjiOkSNHcNFFZwBwww23cMQRX6hz\nqNI/+Op/f4kddtmeddZdm4tu+imTvzOF86ddxLHf/TxnXzWV+fMX8OVPfg2ACa/flkM+8QEWzF9A\nZvKNz5/AU395quZPoGXpa3BxWtFLeCLiPmAfYGYOsCOni9UEXsKjpih5Cc/BG7+3o7/3pz3wsyF7\nCU/pTPZB4PaBBlhJUvdocoAoHWTvBX4dEZcAcxc2egmPJGmhob5rUydKB9n7qmNEdUiS9CJNri4u\nGmQz87jlnyVJ6mZDvUK4E6V3fHoF8Flga2D1he2Z+daS/UqSNBSUvk72DOBuYBPgOOB+4MbCfUqS\nViFNvgtP6SC7Xmb+DzA/M6/OzA8DZrGSpEVK3yAgIsZFxFURcWdE3BERn6zavxwRsyNiRnW8s+01\nn4+IWRFxT0TsuaKfrXTh0/zq55yIeBfwMLBu4T4lSauQQViTXQD8a2beHBFrATdFxOXV776bmd9p\nPzkitgIOpLXUuRFwRUS8KjMHfKeJ0kH2+IhYG/hX4L+AlwGfKtynJGkVUnorhcycA8ypHv8tIu4C\nxizjJfsAZ2XmXOC+iJgF7ARcO9C+i04XZ+YvMvOpzLw9M3fLzB0y84KSfUqSukv7HdyqY9Iyzh0P\nbA9cXzV9IiJui4gpETGqahtDazOlhR5i2UF5qYoG2Yj
YNCIujIgnIuKxiDg/IjYt2ackadXSaeFT\nZk7OzB3bjslL6iciXgr8FDgyM58GTgI2AybQynT/Y2V/ttKFT2cC5wCjac1rnwtMK9ynJGkVMhh3\n4YmI1WgF2DMy82cAmfloZvZmZh9wMq0pYYDZwLi2l4+t2gasdJB9SWaelpkLquN02q6XlSRpEKqL\nA/gf4K72bX0jYsO2094D3F49vgA4MCJGRsQmwBbADSvy2UoXPl0SEUcDZ9HaA/oA4OKIWBcgM/9S\nuH9Jkt4IHAzMjIgZVdsXgIMiYgKt+HQ/8FGAzLwjIs4B7qRVmfzxFakshvJBdv/q50d54UYLQas0\nOgHXZyWpy5XeUCIz/5dW7Fncxct4zVeBr3bad+np4s8B22XmJsCPgFuB/TJzk8w0wEqSyMyOjqGs\ndJA9JjOfjog30drp6RRa1VySJAGDU/hUl9JBduEc9ruAkzPzIrzlnSSpTenCpzqVDrKzI+KHvFDw\nNHIQ+pQkaUgoHfD2By4D9szMJ2ntW/yZwn1KklYhTb4LT+mbtj8L/Kzt+aL9IyVJgvJ7F9ep9CU8\nkiQt01DPRjvh+qgkSYWYyUqSajXUK4Q7YZCVJNWqzzVZSZLKaG6INchKkmpm4ZMkSRowM1lJUq2a\nnMkaZCVJtXIzCkmSCjGTlSSpkCZfJ2vhkyRJhZjJSpJq5ZqsJEmFuCYrSVIhTc5kXZOVJKkQM1lJ\nUq2cLpYkqZAmX8JjkJUk1cpb3UmSVEiTM1kLnyRJKsRMVpJUK6eLJUkqpMnTxQZZSVKtzGQlSSqk\nyZmshU+SJBViJitJqpXTxZIkFdLk6WKDrCSpVpl9dQ+hGNdkJUkqxExWklQr78IjSVIhTb5pu0FW\nklQrM1lJkgppciZr4ZMkSYWYyUqSauVmFJIkFeJmFJIkFdLkNVmDrCSpVk2uLrbwSZKkQsxkJUm1\ncrpYkqRCrC6WJKmQJmeyrslKklSImawkqVZNri42yEqSatXk6WKDrCSpVhY+SZJUSJO3VbTwSZKk\nQsxkJUm1crpYkqRCLHySJKkQ12QlSSokMzs6+iMi3hER90TErIg4uvBHWsQgK0lqtIjoAb4PTAS2\nAg6KiK0Go2+niyVJtRqENdmdgFmZeS9ARJwF7APcWbpjM1lJUq2yw6MfxgAPtj1/qGorbshmss8/\n/6eoewxNFxGTMnNy3eOQOuV3edW2YN7sjv7eR8QkYFJb0+Sh8n0wk+1uk5Z/irRK8LvcxTJzcmbu\n2HYsHmBnA+Pano+t2oozyEqSmu5GYIuI2CQiRgAHAhcMRsdDdrpYkqSVITMXRMQngMuAHmBKZt4x\nGH0bZLvbkFizkFYCv8tapsy8GLh4sPuNJm9nJUlSnVyTlSSpEINsA0XE+Ii4ve5xSFK3M8hKklSI\nQba5eiLi5Ii4IyJ+GRFrRMRHIuLGiLg1In4aES8BiIgfR8RJEXFdRNwbEbtGxJSIuCsiflzz51CX\niYg1I+Ki6nt6e0QcEBH3R8S3ImJmRNwQEZtX5+4VEddHxC0RcUVEbFC1fzkipkbENRHxQES8t+31\nl0bEavV+SnULg2xzbQF8PzO3Bp4E9gN+lpmvy8ztgLuAw9vOHwW8AfgUrevHvgtsDWwTERMGdeTq\ndu8AHs7M7TLzNcClVftTmbkN8D3g/1Vt/wvsnJnbA2cBn217n82AtwJ7A6cDV1Wvfw54V/mPIRlk\nm+y+zJxRPb4JGA+8pvqX/UzgA7SC6EIXZqvUfCbwaGbOzMw+4I7qtdJgmQm8PSK+GRFvzsynqvZp\nbT/fUD0eC1xWfac/w4u/05dk5vzq/Xp4IVjPxO+0BolBtrnmtj3upXVN9I+BT1T/mj8OWH0J5/ct\n9to+vJ5agygzfw+8llYwPD4ijl34q/bTqp//BXyv+k5/lCV8p6t/LM7PF65X9DutQWOQ7S5rAXOq\n9agP1D0YaUkiYiP
g2cw8Hfg2rYALcEDbz2urx2vzwh60hwzaIKV+8l9z3eXfgOuBx6ufa9U7HGmJ\ntgG+HRF9wHzgY8BPgFERcRutDPWg6twvA+dGxF+BXwGbDP5wpaVzxydJQ15E3A/smJlP1D0WaSCc\nLpYkqRAzWUmSCjGTlSSpEIOsJEmFGGSlQiJi64jYu+5xSKqPQVbqp4jojYgZ1X665y7c+3kp574S\n+CLw66X8fteI+EX1eO+IOLp6vG9EbFVg+JJqYJCV+u+5zJxQ7ac7D/i/7b+MlmEAmfmnzPynzHx6\neW+amRdk5jeqp/sCBlmpIQyy0oq5Bti8unfvPRFxKnA7MC4i9oiIayPi5irjfSlARLwjIu6OiJuB\n9y58o4g4NCK+FxG70NrM/ttVxrxZHR9M0spjkJUGKCKGAxNp7a0LrTse/Xd1x6NngGOAt2Xma4Hp\nwFERsTpwMrAXsAMwevH3zczf0boD0meqjPmPxT+MpKLcVlHqvzUiYuGdja4B/gfYCHggM6+r2nem\nNd3724gAGEFrn90tad0Z6Q8AEXE6MGkQxy6pBgZZqf+ey8wX3Vu3CqTPtDcBl2fmQYud5z15pS7k\ndLG0cl0HvDEiNgeIiDUj4lXA3cD4tnXWg5by+r/hjRukxjDISitRZj4OHApMq+4Ycy2wZWY+T2t6\n+KKq8OmxpbzFWcBnIuIWC5+kVZ97F0uSVIiZrCRJhRhkJUkqxCArSVIhBllJkgoxyEqSVIhBVpKk\nQgyykiQVYpCVJKmQ/w92vDj/dBxzaAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ttDjPV7E5hAQ" + }, + "source": [ + "#### 6.2.2. Classification par Linear SVC (svm)" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "YfmW2GnL5hAR", + "colab": {} + }, + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.svm import LinearSVC\n", + "\n", + "model = LinearSVC()\n", + "\n", + "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(featuresLem, labels, df_lem.index, test_size=0.33, random_state=0)\n", + "model.fit(X_train, y_train)\n", + "y_pred = model.predict(X_test)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "yg8lmpWk5hAT" + }, + "source": [ + "On remarque que le f1-score du LinearSVC s'est amélioré par rapport à celui du Naive Bayes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tX3QDxsG7_1p", + "colab_type": "text" + }, + "source": [ + "##### 6.2.2.1. 
F1-score" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "26886a4a-ce28-4269-d323-fe0f7e03844a", + "id": "1bI6ucAa5hAW", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "source": [ + "from sklearn import metrics\n", + "print(metrics.classification_report(y_test, y_pred, target_names=df_lem['Category'].unique()))" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " ham 0.99 1.00 0.99 1597\n", + " spam 1.00 0.93 0.96 242\n", + "\n", + " accuracy 0.99 1839\n", + " macro avg 0.99 0.96 0.98 1839\n", + "weighted avg 0.99 0.99 0.99 1839\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "1cdc9cd7-8fe7-4967-8653-ca5ac3dd0470", + "id": "_YHFm-MZ5hAY", 
+ "colab": { + "base_uri": "https://localhost:8080/", + "height": 388 + } + }, + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "conf_mat = confusion_matrix(y_test, y_pred)\n", + "fig, ax = plt.subplots(figsize=(8,6))\n", + "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", + "plt.ylabel('Actuel')\n", + "plt.xlabel('Prédit')\n", + "plt.show()" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAeIElEQVR4nO3de7hd07n48e+bKEIrEdQlcUmJalGX\ntkpbqi51v5QqitI6jRZ1yqlWb6faOj38Slun1BHXBBUpekRbWkfV5ShCqbiLoJIitETrmmS/vz/W\nTGypJHvvlbHnztzfj2c+a66x5lpzLM968u53jHeOGZmJJEla9AbU3QFJkprKICtJUiEGWUmSCjHI\nSpJUiEFWkqRCDLKSJBWyRN0dmJ+Zz07x2iIt9gattmXdXZAWiVmvTYtSn93uv/dvWfEdxfrWrj4b\nZCVJ/UTH7Lp7UIzDxZIkFWImK0mqV3bU3YNiDLKSpHp1GGQlSSoiG5zJOicrSVIhZrKSpHo5XCxJ\nUiENHi42yEqS6tXg62QNspKkejU4k7XwSZKkQsxkJUn1svBJkqQymnydrEFWklQvM1lJkgppcCZr\n4ZMkqdEi4tyImB4R93RqOz4ipkXEXdW2c6fXvhYRkyPiwYjYoVP7jlXb5Ig4rivnNpOVJNWr/HWy\n5wOnAWPnaf9RZp7cuSEi3g3sB6wPrAb8b0SsW718OrA9MBWYGBETMvO+BZ3YICtJqlfh4eLMvCEi\n1uri4XsA4zLzVeDRiJgMbFa9NjkzpwBExLjq2AUGWYeLJUn16uhoa4uIURFxe6dtVBfPfGRE3F0N\nJy9ftQ0Dnuh0zNSqbX7tC2SQlSQt1jJzdGa+r9M2ugtvOwNYG9gYeBI4pUTfHC6WJNWrhurizHx6\nzn5EnAX8sno6DVi906HDqzYW0D5fZrKSpHq1OVzcExGxaqenHwfmVB5PAPaLiKUiYgQwErgNmAiM\njIgREbEkreKoCQs7j5msJKlWmWWriyPiYmBrYMWImAp8G9g6IjYGEngMOKzVl7w3IsbTKmiaBRyR\nVQcj4kjgN8BA4NzMvHeh587MRf6FFoWZz07pmx2TumHQalvW3QVpkZj12rQo9dmv3PXLtv69X3rj\nXYv1rV0OF0uSVIjDxZKkerl2sSRJhTR47WKDrCSpXuWXVayNQVaSVK8GZ7IWPkmSVIiZrCSpXhY+\nSZJUSIOHiw2ykqR6NTiTdU5WkqRCzGQlSfVqcCZrkJUk1ar0DQLqZJCVJNXLTFaSpEIaXF1s4ZMk\nSYWYyUqS6uVwsSRJhTR4uNggK0mql5m
sJEmFNDiTtfBJkqRCzGQlSfVyuFiSpEIMspIkFeKcrCRJ\n6i4zWUlSvRwuliSpkAYPFxtkJUn1MpOVJKmQBmeyFj5JklSImawkqV4OF0uSVIhBVpKkQjLr7kEx\nBllJUr0anMla+CRJUiFmspKkejU4kzXISpLq1eDrZA2ykqR6NTiTdU5WkqRCzGQlSfXyEh5Jkgpp\n8HCxQVaSVC+DrCRJhTS4utjCJ0mSCjGTlSTVKjssfJIkqQznZCVJKqTBc7IGWUlSvRo8XGzhkyRJ\nhZjJSpLq5ZysJEmFGGQlSSqkwWsXOycrSVIhBtmG+eb3f8hWu+zHngd+fm7b6edcyDZ7HMjeBx/B\n3gcfwQ033wbAzJkz+eZ//JCPH/QF9jr4cG77491z3zNz5kyOP+lUdtnvX9ht/89xzXU39fp3kRbm\nrNGn8Jepf+KuO6+tuytqR0dHe9tCRMS5ETE9Iu7p1PaDiHggIu6OiF9ExJBOr30tIiZHxIMRsUOn\n9h2rtskRcVxXvppBtmH23Hl7/vuHJ/xT+0H77sllY07nsjGns9UHNwPg0glXA/CLC87grB9/n5NP\nO4uO6gd75phxDF1+CL8adzZXXHQm79tkw977ElIXjR07nl12PaDubqhdHdnetnDnAzvO03YNsEFm\nvgd4CPgaQES8G9gPWL96z08jYmBEDAROB3YC3g3sXx27QAbZhnnfxhsyeLm3denYRx77M5u9dyMA\nVlh+CG9767Lc+8DDAPziV7/lXw7aF4ABAwaw/JDBZTosteHGm27lb889X3c31K7saG9b2Mdn3gD8\nbZ6232bmrOrpLcDwan8PYFxmvpqZjwKTgc2qbXJmTsnM14Bx1bELVLzwKSLeA6zV+VyZeXnp8+qN\nLr7sSiZcfS3rrzeSY4/8HIOXexvvXGcEv7/pFnbebmuemv4M9z04maeefoY1Vx8GwGlnjWXinXez\n+rBV+foxh7Pi0OVr/haSGqn+xSg+C1xS7Q+jFXTnmFq1ATwxT/sHFvbBRTPZiDgXOBfYG9it2nZd\nwPGjIuL2iLj97LEXl+xav7Lvx3fhqvHnctn5p7PSCkP5wWlnAfDxXXZg5ZVWZN9Dj+KkU89k4w3e\nxYCBA5g9ezZPT3+WjTd8Fz8/7zQ22uBdnHza2TV/C0l6c51jR7WN6sZ7vwHMAi4q0bfSmezmmbnQ\nMes5MnM0MBpg5rNTav/Tpik6Z6Cf2H0njjj22wAsscRAvvqvh8197YDDjmGt1YcxZPByDFp6Kbb7\nyIcA+NhHt+TyK3/Tu52W1G9km9fJdo4d3RERh9BK/LbNnHsd0TRg9U6HDa/aWED7fJWek/1DVyaG\nVdYzz74+FXHt9TezzjvWBODlV17hpZdfAeDm2/7IEgMHsvaINYkIPvKhDzDxzla18a2338XaI9bo\n/Y5L6h/KFz79k4jYEfgKsHtmvtTppQnAfhGxVESMAEYCtwETgZERMSIilqRVHDVhYecpncmOpRVo\nnwJeBQLIqppLBRz77ROZeOfdPP/8C2y754EcfuhBTLzzbh58eAoEDFtlZb79laMA+NtzMzjs6G8Q\nAwaw8kor8J///uW5n3PM4Z/la989mRNPPZOhQwZzwtePqesrSfN14QWn85GttmDFFYfy2JTb+c53\nT+a888fV3S11V+G78ETExcDWwIoRMRX4Nq1q4qWAayIC4JbM/Hxm3hsR44H7aA0jH5GZs6vPORL4\nDTAQODcz713oubPgShsRMRk4BpgEzP2/mJmPL+y9DherCQattmXdXZAWiVmvTYtSn/3iCQe29e/9\nst+8sFjf2lU6k30mMxeaTkuS+rH6q4uLKR1k74yInwFX0houBryER5LUiTcI6LFBtILrxzq1JWCQ\nlSS1mMn2TGZ+puTnS5IaoHDhU52KBtmIWBo4lNYakEvPac/Mz5Y8ryRJfUHp62QvAFYBdgCup3Xx\n7t8
Ln1OStDip4TrZ3lJ6TnadzNwnIvbIzDFVEdSNhc8pSVqMtLviU19WOsjOrB6fj4gNgKeAtxc+\npyRpcdLHs9F2lA6yoyNieeCbtJafeivwrcLnlCQtTgyyPXYBrTvwrAWMqdpWLnxOSZL6hNJB9gpg\nBnAHnRajkCRpLi/h6bHhmblj4XNIkhZnDhf32M0RsWFmTip8HknSYioNst0TEZNoLZ+4BPCZiJiC\nt7qTJPUzpTLZXQt9riSpacxku6cr94uVJAnwLjySJBVjJitJUiENDrKlbxAgSVK/ZSYrSapVZnMz\nWYOsJKleDR4uNshKkuplkJUkqYwmr/hk4ZMkSYWYyUqS6tXgTNYgK0mqV3MXfDLISpLq5ZysJEnq\nNjNZSVK9GpzJGmQlSfVyTlaSpDKaPCdrkJUk1avBmayFT5IkFWImK0mqlcPFkiSV0uDhYoOsJKlW\naZCVJKmQBgdZC58kSSrETFaSVCuHiyVJKsUgK0lSGU3OZJ2TlSSpEDNZSVKtmpzJGmQlSbUyyEqS\nVEpG3T0oxiArSapVkzNZC58kSSrETFaSVKvscLhYkqQimjxcbJCVJNUqLXySJKmMJmeyFj5JklSI\nmawkqVZNLnwyk5Uk1Sqzva0rIuJfI+KeiLg3Ir5UtQ2NiGsi4uHqcfmqPSLivyJickTcHRGb9vS7\nGWQlSbXKjmhrW5iI2AD4HLAZsBGwa0SsAxwHXJuZI4Frq+cAOwEjq20UcEZPv5tBVpLUdO8Cbs3M\nlzJzFnA9sBewBzCmOmYMsGe1vwcwNltuAYZExKo9ObFBVpJUq3Yz2YgYFRG3d9pGzXOKe4AtI2KF\niFgG2BlYHVg5M5+sjnkKWLnaHwY80en9U6u2brPwSZJUq67Oq87//TkaGL2A1++PiJOA3wIvAncB\ns+c5JiOizZ78MzNZSVKtSs/JAmTmOZn53szcCngOeAh4es4wcPU4vTp8Gq1Md47hVVu3GWQlSbXK\njLa2roiIt1ePa9Caj/0ZMAE4uDrkYOCKan8C8OmqynhzYEanYeVucbhYktQfXBYRKwAzgSMy8/mI\nOBEYHxGHAo8Dn6yO/TWtedvJwEvAZ3p6UoOsJKlWvbGsYmZu+SZtfwW2fZP2BI5YFOc1yEqSatXh\nDQIkSSqjX96FJyKuBOZbzpyZuxfpkSSpX2ny2sULymRP7rVeSJLUQPMNspl5/Zz9iBgErJGZD/ZK\nryRJ/Ua7i1H0ZQu9TjYidqO1OsbV1fONI2JC6Y5JkvqH3liMoi5dKXw6ntadC34PkJl3RcSIgn2S\nJPUjTa4u7sqKTzMzc8Y8bQ1O7iVJWjS6ksneGxGfAgZGxEjgKODmst2SJPUXTb6EpyuZ7BeB9YFX\ngYuBF4AvleyUJKn/yGxv68sWmslm5kvAN6pNkqRFqslzsgsNshFxHW8yB5uZ2xTpkSSpX2nycHFX\n5mS/3Gl/aWBvYFaZ7kiS1BxdGS6+Y56m/4uI2wr1R5LUz/T1edV2dGW4eGinpwOA9wKDi/Wosuyw\nrUqfQipukxXXrrsLUp/Xr+dkgTtozckGrWHiR4FDS3ZKktR/9Pc52Xdl5iudGyJiqUL9kST1M03O\nZLtyneybLTzxh0XdEUmSmmZB95NdBRgGDIqITWgNFwMsByzTC32TJPUDDa57WuBw8Q7AIcBw4BRe\nD7IvAF8v2y1JUn/R5OHiBd1PdgwwJiL2zszLerFPkqR+pMmFT12Zk31vRAyZ8yQilo+IEwr2SZKk\nRuhKkN0pM5+f8yQznwN2LtclSVJ/0tHm1pd15RKegRGxVGa+ChARgwAv4ZEkLRJJc4eLuxJkLwKu\njYjzaBU/HQKMKdkpSVL/0dHg8uKurF18UkT8CdiOVqX1b4A1S3dMktQ/dDQ4k+3KnCzA07QC7D7A\nNsD9xXokSVJDLGgxinWB/avtWeASIDLzo73UN0lSP9Bf52QfAG4Ed
s3MyQARcXSv9EqS1G/09Qrh\ndixouHgv4Enguog4KyK2hQb/uSFJqkUSbW192XyDbGb+T2buB6wHXAd8CXh7RJwRER/rrQ5KkrS4\nWmjhU2a+mJk/y8zdaK1jfCfw1eI9kyT1C/19MYq5qtWeRlebJElt6+uBsh3dCrKSJC1qfX1etR0G\nWUlSrTqaG2O7vBiFJEnqJjNZSVKtmrysokFWklSrBt8fwCArSaqX1cWSJBXSEc0dLrbwSZKkQsxk\nJUm1ck5WkqRCnJOVJKkQF6OQJEndZiYrSaqVi1FIklSIhU+SJBXS5DlZg6wkqVZNri628EmSpELM\nZCVJtXJOVpKkQpo8J+twsSSpVh1tbl0REUMi4tKIeCAi7o+ILSJiaERcExEPV4/LV8dGRPxXREyO\niLsjYtOefjeDrCSpVr0RZIFTgaszcz1gI+B+4Djg2swcCVxbPQfYCRhZbaOAM3r63QyykqRGi4jB\nwFbAOQCZ+VpmPg/sAYypDhsD7Fnt7wGMzZZbgCERsWpPzm2QlSTVKqO9LSJGRcTtnbZR85xiBPAM\ncF5E3BkRZ0fEssDKmflkdcxTwMrV/jDgiU7vn1q1dZuFT5KkWrV7nWxmjgZGL+CQJYBNgS9m5q0R\ncSqvDw3P+YyMiEVe6GwmK0mqVS/MyU4FpmbmrdXzS2kF3afnDANXj9Or16cBq3d6//CqrdsMspKk\nRsvMp4AnIuKdVdO2wH3ABODgqu1g4IpqfwLw6arKeHNgRqdh5W5xuFiSVKteWozii8BFEbEkMAX4\nDK1Ec3xEHAo8DnyyOvbXwM7AZOCl6tgeMchKkmrVG4tRZOZdwPve5KVt3+TYBI5YFOc1yEqSatXk\nGwQYZCVJtWpykLXwSZKkQsxkJUm18i48kiQV0uS78BhkJUm1avKcrEFWklSrJg8XW/gkSVIhZrKS\npFp1NDiXNchKkmrlnKwkSYU0N491TlaSpGLMZCVJtXK4WJKkQlyMQpKkQqwuliSpkOaGWAufJEkq\nxkxWklQrC58kSSrEOVlJkgppbog1yEqSatbk4WILnyRJKsRMVpJUK+dkJUkqpLkh1iArSaqZc7KS\nJKnbzGQlSbXKBg8YG2QlSbVq8nCxQVaSVCuriyVJKqS5IdbCJ0mSijGT7SdGn3kyO++8Hc888yyb\nbLodABdd+FPWXXdtAAYPXo4ZM17g/ZvtUGc3pX+y8mpv5/hTv87QlYZCJr+48ErGnXMpR33rC2y5\n/QeZ+dospj4+je8efSL/eOEfrDp8FcZffwF/nvJnACbdcR8nHndKzd9CC+JwsRZ7Yy/4OT8943zO\nO/fHc9sOOPDwufsnnfQtXpjx9zq6Ji3QrFmz+fF3f8qDkx5imWUHMfbqs7n1honcesPtnP790cye\nPZsjv/F5DvnigZz2H/8NwLTHp3HA9ofW3HN1VZMLnxwu7iduuulWnnvu+fm+/om9d+OS8Vf0Yo+k\nrvnr9L/y4KSHAHjpxZd5bPLjrLTqStx6/URmz54NwD133MvKq65UZzfVhmzzv76saCYbEQOBXYC1\nOp8rM39Y8rzqng9/+ANMn/4Mkyc/WndXpAVadfgqvHODkdz7x/ve0L77/jtzzRW/m/t8tTVW5cLf\nns2Lf3+JM046m7tuu7u3u6puaHImW3q4+ErgFWASXfj/GBGjgFEAAwcOYcDAZcv2TgDsu+8eZrHq\n8wYtM4iTzv4eP/z3n/DiP16a2/6Zow5i1qzZXHX5NQA8O/2v7Pb+fZjx3Aust+G6nHze99l360+/\n4T1SbykdZIdn5nu6enBmjgZGAyy51PC+PQbQEAMHDmTPPXZi8y12rrsr0nwNXGIgJ539Pa6+/Bqu\nu+qGue27fnJHPrzdFhy+79Fz22a+NpMZr80E4IFJDzH1sWms8Y7Vuf/uB3u93+qavj7k247Sc7JX\nRcTHCp9Dbdh22y158MFHmDbty
bq7Is3Xt075Ko89/Dg/Gz1+btsWW2/GQYd/in875Gu8+vKrc9uH\nDB3MgAGtf9qGrbEqq48YzrQ//6XX+6yu62hz68tKZ7K3AL+IiAHATCCAzMzlCp9X87hg7GlstdUW\nrLjiUKY8MpHvfu8Uzj9/HJ/cZ3cuGf8/dXdPmq+NNtuQXfbZkYfve4SLrjkHgNP/8yy+/L2jWHKp\nJTn9klaJx5xLdTbZfGM+f+xnmTVrFh0dyYnHncILz1s535d1ZHMz2ciCXy4iHgX2ACZlN0/kcLGa\nYKMV3lF3F6RFYuJfbohSn33Qmnu19e/9BY9fXqxv7SqdyT4B3NPdACtJ6j+aHCBKB9kpwO8j4ipg\n7qSJl/BIkuZwxaeee7Talqw2SZLeoMnVxUWDbGZ+p+TnS5IWf329QrgdpVd8Wgn4CrA+sPSc9szc\npuR5JUnqC0pfJ3sR8AAwAvgO8BgwsfA5JUmLkQ6yra0vKx1kV8jMc4CZmXl9Zn4WMIuVJM3lDQJ6\nbmb1+GRE7AL8BRha+JySpMWIc7I9d0JEDAb+DfgJsBxw9ILfIknqT5q8lELp6uJfVrszgI+WPJck\nSX1N0TnZiHhHRFwZEc9GxPSIuCIiXGdOkjSXhU899zNgPLAKsBrwc+DiwueUJC1GmnwXntJBdpnM\nvCAzZ1XbhXS6XlaSpNLVxRGxdETcFhF/ioh7I+I7VfuIiLg1IiZHxCURsWTVvlT1fHL1+lo9/W69\ncT/Z4yJirYhYMyK+Avw6IoZGhFXGkqTe8CqwTWZuBGwM7BgRmwMnAT/KzHWA54BDq+MPBZ6r2n9U\nHdcjpauLP1k9HsbrN1oIYL/qufOzktTPlZ5Xre4E94/q6VuqLWmt2/Cpqn0McDxwBq1btB5ftV8K\nnBYR0ZM7ypXOZL8KbJSZI4DzgD8Be2fmiMw0wEqSyMy2tq6IiIERcRcwHbgGeAR4PjNnVYdMBYZV\n+8No3aqV6vUZwAo9+W6lg+w3M/OFiPgwrb8Yzqb1V4IkSUD7hU8RMSoibu+0jZr3HJk5OzM3BoYD\nmwHrFf9ilB8unl097gKclZm/iogTCp9TkrQYaXdpxMwcDYzu4rHPR8R1wBbAkIhYospWhwPTqsOm\nAasDUyNiCWAw8Nee9K10JjstIs4E9qVV8LRUL5xTkqS5ImKliBhS7Q8CtgfuB64DPlEddjBwRbU/\noXpO9frvejIfC71T+LQjcHL118OqwLGFzylJWoz0woISqwJjImIgrURvfGb+MiLuA8ZVI6x3AudU\nx58DXBARk4G/0SrW7ZHSyyq+BFze6fmTwJMlzylJWryUXrs4M+8GNnmT9im05mfnbX8F2GdRnLt0\nJitJ0gL19aUR2+H8qCRJhZjJSpJq1ddvvN4Og6wkqVYd3k9WkqQymhtiDbKSpJpZ+CRJkrrNTFaS\nVKsmZ7IGWUlSrUovRlEng6wkqVZmspIkFdLk62QtfJIkqRAzWUlSrZyTlSSpEOdkJUkqpMmZrHOy\nkiQVYiYrSaqVw8WSJBXS5Et4DLKSpFp5qztJkgppciZr4ZMkSYWYyUqSauVwsSRJhTR5uNggK0mq\nlZmsJEmFNDmTtfBJkqRCzGQlSbVyuFiSpEKaPFxskJUk1Sqzo+4uFOOcrCRJhZjJSpJq5V14JEkq\npMk3bTfISpJqZSYrSVIhTc5kLXySJKkQM1lJUq1cjEKSpEJcjEKSpEKaPCdrkJUk1arJ1cUWPkmS\nVIiZrCSpVg4XS5JUiNXFkiQV0uRM1jlZSZIKMZOVJNWqydXFBllJUq2aPFxskJUk1crCJ0mSCmny\nsooWPkmSVIiZrCSpVg4XS5JUiIVPkiQV0uQ5WYOsJKlWTc5kLXySJKkQM1lJUq2anMkaZCVJtWpu\niIVo8l8QWrCIGJWZo+vuh9Quf8vqq5yT7d9G1d0BaRHxt6w+ySArSVIhBllJkgoxyPZvzmGpKfw
t\nq0+y8EmSpELMZCVJKsQg20ARsVZE3FN3PySpvzPISpJUiEG2uQZGxFkRcW9E/DYiBkXE5yJiYkT8\nKSIui4hlACLi/Ig4IyJuiYgpEbF1RJwbEfdHxPk1fw/1MxGxbET8qvqd3hMR+0bEYxHx/yJiUkTc\nFhHrVMfuFhG3RsSdEfG/EbFy1X58RIyJiBsj4vGI2KvT+6+OiLfU+y3VXxhkm2skcHpmrg88D+wN\nXJ6Z78/MjYD7gUM7Hb88sAVwNDAB+BGwPrBhRGzcqz1Xf7cj8JfM3CgzNwCurtpnZOaGwGnAj6u2\nm4DNM3MTYBzwlU6fszawDbA7cCFwXfX+l4Fdyn8NySDbZI9m5l3V/h3AWsAG1V/2k4ADaAXROa7M\nVqn5JODpzJyUmR3AvdV7pd4yCdg+Ik6KiC0zc0bVfnGnxy2q/eHAb6rf9LG88Td9VWbOrD5vIK8H\n60n4m1YvMcg216ud9mfTuhnE+cCR1V/z3wGWfpPjO+Z5bwfeSEK9KDMfAjalFQxPiIh/n/NS58Oq\nx58Ap1W/6cN4k9909cfizHz9ekV/0+o1Btn+5W3Ak9V81AF1d0Z6MxGxGvBSZl4I/IBWwAXYt9Pj\nH6r9wcC0av/gXuuk1EX+Nde/fAu4FXimenxbvd2R3tSGwA8iogOYCXwBuBRYPiLuppWh7l8dezzw\n84h4DvgdMKL3uyvNnys+SerzIuIx4H2Z+WzdfZG6w+FiSZIKMZOVJKkQM1lJkgoxyEqSVIhBViok\nItaPiN3r7oek+hhkpS6KiNkRcVe1nu7P56z9PJ9j1wC+Afx+Pq9vHRG/rPZ3j4jjqv09I+LdBbov\nqQYGWanrXs7Mjav1dF8DPt/5xWgZAJCZf87MT2XmCwv70MyckJknVk/3BAyyUkMYZKWeuRFYp7p3\n74MRMRa4B1g9Ij4WEX+IiD9WGe9bASJix4h4ICL+COw154Mi4pCIOC0iPkhrMfsfVBnz2nV8MUmL\njkFW6qaIWALYidbautC649FPqzsevQh8E9guMzcFbgeOiYilgbOA3YD3AqvM+7mZeTOtOyAdW2XM\njxT/MpKKcllFqesGRcScOxvdCJwDrAY8npm3VO2b0xru/b+IAFiS1jq769G6M9LDABFxITCqF/su\nqQYGWanrXs7MN9xbtwqkL3ZuAq7JzP3nOc578kr9kMPF0qJ1C/ChiFgHICKWjYh1gQeAtTrNs+4/\nn/f/HW/cIDWGQVZahDLzGeAQ4OLqjjF/ANbLzFdoDQ//qip8mj6fjxgHHBsRd1r4JC3+XLtYkqRC\nzGQlSSrEICtJUiEGWUmSCjHISpJUiEFWkqRCDLKSJBVikJUkqRCDrCRJhfx/YJWjuRmq0BwAAAAA\nSUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "s2_y66pe5hAa" + }, + "source": [ + "On remarque un petit peu d'amélioration, alors SVM est un bon choix pour ce cas de dataSet " + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "outputId": "2112fb86-7a3b-4cce-8dca-a7447e844980", + "id": "m_oAPb_V5hAb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 619 + } + }, + "source": [ + "from IPython.display import display\n", + "\n", + "for predicted in category_id_df.category_id:\n", + " for actual in category_id_df.category_id:\n", + " if predicted != actual and conf_mat[actual, predicted] >= 6:\n", + " print(\"'{}' est prédit comme '{}' : {} exemples.\".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))\n", + " display(df_lem.loc[indices_test[(y_test == actual) & (y_pred == predicted)]][['Category', 'Message_lemmatized_before_stemming']])\n", + " print('')" + ], + "execution_count": 0, + "outputs": [ + { + "output_type": "stream", + "text": [ + "'spam' est prédit comme 'ham' : 17 exemples.\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryMessage_lemmatized_before_stemming
684spamHi I 'm sue . I 20 year old work lapdancer . I love sex . Text live - I 'm bedroom . text SUE 89555 . By TextOperator G2 1DA 150ppmsg 18+
731spamEmail AlertFrom : Jeri StewartSize : 2KBSubject : Low-cost prescripiton drvgsTo listen email call 123
1940spamMore people dogging area . Call 09090204448 join like minded guy . Why arrange 1 . There 's 1 evening . A£1.50 minAPN LS278BB
751spamDo realize 40 year , 'll thousand old lady running around tattoo ?
4213spamMissed call alert . These number called left message . 07008009200
4298spamthesmszone.com let send free anonymous masked messages..im sending message there..do see potential abuse ? ? ?
333spamCall Germany 1 penny per minute ! Call fixed line via access number 0844 861 85 85 . No prepayment . Direct access !
3864spamOh god ! I 've found number ! I 'm glad , text back xafter msg cst std ntwk chg £1.50
5449spamLatest News ! Police station toilet stolen , cop nothing go !
1430spamFor sale - arsenal dartboard . Good condition double treble !
2247spamHi ya babe x u 4goten bout ? ' scammer getting smart..Though regular vodafone , respond get prem rate msg/subscription . Other no used also . Beware !
3991spam( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per..
3460spamNot heard U4 . Call night knickers . Make beg like U last time 01223585236 XX Luv Nikiyu4.net
2823spamROMCAPspam Everyone around responding well presence since warm outgoing . You bringing real breath sunshine .
2402spamBabe : U want dont u baby ! Im nasty thing 4 filthyguys . Fancy rude time sexy bitch . How go slo n hard ! Txt XXX SLO ( 4msgs )
3501spamDorothy @ kiefer.com ( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per..
4821spamCheck Out Choose Your Babe Videos @ sms.shsex.netUN fgkslpoPW fgkslpo
\n", + "
" + ], + "text/plain": [ + " Category Message_lemmatized_before_stemming\n", + "684 spam Hi I 'm sue . I 20 year old work lapdancer . I love sex . Text live - I 'm bedroom . text SUE 89555 . By TextOperator G2 1DA 150ppmsg 18+ \n", + "731 spam Email AlertFrom : Jeri StewartSize : 2KBSubject : Low-cost prescripiton drvgsTo listen email call 123 \n", + "1940 spam More people dogging area . Call 09090204448 join like minded guy . Why arrange 1 . There 's 1 evening . A£1.50 minAPN LS278BB \n", + "751 spam Do realize 40 year , 'll thousand old lady running around tattoo ? \n", + "4213 spam Missed call alert . These number called left message . 07008009200 \n", + "4298 spam thesmszone.com let send free anonymous masked messages..im sending message there..do see potential abuse ? ? ? \n", + "333 spam Call Germany 1 penny per minute ! Call fixed line via access number 0844 861 85 85 . No prepayment . Direct access ! \n", + "3864 spam Oh god ! I 've found number ! I 'm glad , text back xafter msg cst std ntwk chg £1.50 \n", + "5449 spam Latest News ! Police station toilet stolen , cop nothing go ! \n", + "1430 spam For sale - arsenal dartboard . Good condition double treble ! \n", + "2247 spam Hi ya babe x u 4goten bout ? ' scammer getting smart..Though regular vodafone , respond get prem rate msg/subscription . Other no used also . Beware ! \n", + "3991 spam ( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per.. \n", + "3460 spam Not heard U4 . Call night knickers . Make beg like U last time 01223585236 XX Luv Nikiyu4.net \n", + "2823 spam ROMCAPspam Everyone around responding well presence since warm outgoing . You bringing real breath sunshine . \n", + "2402 spam Babe : U want dont u baby ! Im nasty thing 4 filthyguys . Fancy rude time sexy bitch . How go slo n hard ! 
Txt XXX SLO ( 4msgs ) \n", + "3501 spam Dorothy @ kiefer.com ( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per..\n", + "4821 spam Check Out Choose Your Babe Videos @ sms.shsex.netUN fgkslpoPW fgkslpo " + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "gzgUQ8ml5hAe", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 0, + "outputs": [] + } + ] +} \ No newline at end of file From eccbba570c890ae63555b648e5601206dedce585 Mon Sep 17 00:00:00 2001 From: anass236 Date: Mon, 13 Jan 2020 02:53:16 +0100 Subject: [PATCH 2/2] Delete [IA_Niv2]_NLP_TP_MARAH_Anass.ipynb --- [IA_Niv2]_NLP_TP_MARAH_Anass.ipynb | 2284 ---------------------------- 1 file changed, 2284 deletions(-) delete mode 100644 [IA_Niv2]_NLP_TP_MARAH_Anass.ipynb diff --git a/[IA_Niv2]_NLP_TP_MARAH_Anass.ipynb b/[IA_Niv2]_NLP_TP_MARAH_Anass.ipynb deleted file mode 100644 index 052398d4..00000000 --- a/[IA_Niv2]_NLP_TP_MARAH_Anass.ipynb +++ /dev/null @@ -1,2284 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - }, - "colab": { - "name": "[IA_Niv2] NLP_TP_MARAH_Anass.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true, - "include_colab_link": true - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B8hAa9wS67FD", - 
"colab_type": "text" - }, - "source": [ - "#**Introduction**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8HenIWJbpaCj", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import numpy as np\n", - "import pandas as pd" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "MraTQd-jpaCq", - "colab_type": "code", - "colab": {} - }, - "source": [ - "spam = pd.read_csv('spam.csv')" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mhcBYtTqpaCt", - "colab_type": "code", - "outputId": "1bde3f20-9e4c-481f-8ab6-61224706f41f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 195 - } - }, - "source": [ - "spam.head()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", - "
" - ], - "text/plain": [ - " Category Message\n", - "0 ham Go until jurong point, crazy.. Available only ...\n", - "1 ham Ok lar... Joking wif u oni...\n", - "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", - "3 ham U dun say so early hor... U c already then say...\n", - "4 ham Nah I don't think he goes to usf, he lives aro..." - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 5 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D93tkBtlp7uh", - "colab_type": "text" - }, - "source": [ - "**Le but** de cette démonstration est d'appliquer les principaux fonctionalités de Natural Language Processing, il s'agit de lemmentazing and stemming, on va travailler sur un DataSet des émails composé de deux columns:\n", - "\n", - "\n", - "1. Category: les types de messages soit (\"ham\"/\"spam\")\n", - "2. Message: message écrit sur l'email\n", - "\n", - "Après, on va essayer de prédire la catégorie de chaque message, à l'aide des algorithmes de classifications" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "JuNlg_yXpaCz", - "colab_type": "code", - "outputId": "c72bce71-5933-4f1f-bf68-e118a398e264", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 175 - } - }, - "source": [ - "# Installer les librairies nécessaires pour cette étude:\n", - "\n", - "!pip install nltk\n", - "\n", - "import nltk\n", - "\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt')\n", - "nltk.download('wordnet')" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.12.0)\n", - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Unzipping corpora/stopwords.zip.\n", - "[nltk_data] Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Unzipping 
tokenizers/punkt.zip.\n", - "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", - "[nltk_data] Unzipping corpora/wordnet.zip.\n" - ], - "name": "stdout" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "True" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 6 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bY66z-NhpaC4", - "colab_type": "text" - }, - "source": [ - "# **1- Suppression des stopwords de tous les messages**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "z2qdjnkGpaC5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\n", - "from nltk.tokenize import word_tokenize\n", - "from nltk.corpus import stopwords\n", - "\n", - "stop_words = set(stopwords.words('english'))\n", - "# Assign the messages without stopwords to another column\n", - "spam['Message_without_stopwords'] = spam.Message.apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "p1nNJ83BpaC9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "spam['Message_with_stopwords'] = spam.Message" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "jcErxwmxpaDz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "spam.drop(columns=['Message'],inplace=True)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YK2CNic4paDD", - "colab_type": "text" - }, - "source": [ - "# **2- Application du stemming sur les messages sans stopwords**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oZIBQesipaDE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nltk.stem import SnowballStemmer\n", - "from nltk.tokenize import word_tokenize\n", - "\n", - "stemmer = SnowballStemmer('english')\n", - "spam['Message_stemmed_without_StopWords'] 
= spam.Message_without_stopwords.apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "alk_IGb6paDL", - "colab_type": "text" - }, - "source": [ - "# **3- Application de la lematization sur les messages sans stopwords (mais pas après stemming)**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "rP2LlXsvpaDN", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nltk.stem import WordNetLemmatizer \n", - "\n", - "lemmatizer = WordNetLemmatizer()\n", - "spam['Message_lemmatized_before_stemming'] = spam.Message_without_stopwords.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nYZjzxGjpaDT", - "colab_type": "text" - }, - "source": [ - "# **4- Comparation des résultats entre 2 et 3 (2-3 phrases suffisent)**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "aRuLdRTzpaDV", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# show the totality of message in our dataSet\n", - "pd.set_option('display.max_colwidth', -1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "y4FLI0AgpaDZ", - "colab_type": "code", - "outputId": "2b41d1ee-d5fb-4efc-ee7e-6e79f26d3779", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 511 - } - }, - "source": [ - "spam[['Message_without_stopwords','Message_stemmed_without_StopWords','Message_lemmatized_before_stemming']].head(10)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Message_without_stopwordsMessage_stemmed_without_StopWordsMessage_lemmatized_before_stemming
0Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ...go jurong point , crazy.. avail bugi n great world la e buffet ... cine got amor wat ...Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ...
1Ok lar ... Joking wif u oni ...ok lar ... joke wif u oni ...Ok lar ... Joking wif u oni ...
2Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 'sfree entri 2 wkli comp win fa cup final tkts 21st may 2005 . text fa 87121 receiv entri question ( std txt rate ) t & c 's appli 08452810075over18 'sFree entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's
3U dun say early hor ... U c already say ...u dun say earli hor ... u c alreadi say ...U dun say early hor ... U c already say ...
4Nah I n't think goes usf , lives around thoughnah i n't think goe usf , live around thoughNah I n't think go usf , life around though
5FreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcvfreemsg hey darl 's 3 week 's word back ! i 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcvFreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcv
6Even brother like speak . They treat like aids patent .even brother like speak . they treat like aid patent .Even brother like speak . They treat like aid patent .
7As per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friends Callertuneas per request mell mell ( oru minnaminungint nurungu vettam ) ' set callertun caller . press *9 copi friend callertunAs per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friend Callertune
8WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hours .winner ! ! as valu network custom select receivea £900 prize reward ! to claim call 09061701461 . claim code kl341 . valid 12 hour .WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hour .
9Had mobile 11 months ? U R entitled Update latest colour mobiles camera Free ! Call The Mobile Update Co FREE 08002986030had mobil 11 month ? u r entitl updat latest colour mobil camera free ! call the mobil updat co free 08002986030Had mobile 11 month ? U R entitled Update latest colour mobile camera Free ! Call The Mobile Update Co FREE 08002986030
\n", - "
" - ], - "text/plain": [ - " Message_without_stopwords ... Message_lemmatized_before_stemming\n", - "0 Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ... ... Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ... \n", - "1 Ok lar ... Joking wif u oni ... ... Ok lar ... Joking wif u oni ... \n", - "2 Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's ... Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's\n", - "3 U dun say early hor ... U c already say ... ... U dun say early hor ... U c already say ... \n", - "4 Nah I n't think goes usf , lives around though ... Nah I n't think go usf , life around though \n", - "5 FreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcv ... FreeMsg Hey darling 's 3 week 's word back ! I 'd like fun still ? Tb ok ! XxX std chgs send , £1.50 rcv \n", - "6 Even brother like speak . They treat like aids patent . ... Even brother like speak . They treat like aid patent . \n", - "7 As per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friends Callertune ... As per request 'Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . Press *9 copy friend Callertune \n", - "8 WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hours . ... WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hour . \n", - "9 Had mobile 11 months ? U R entitled Update latest colour mobiles camera Free ! Call The Mobile Update Co FREE 08002986030 ... Had mobile 11 month ? 
U R entitled Update latest colour mobile camera Free ! Call The Mobile Update Co FREE 08002986030 \n", - "\n", - "[10 rows x 3 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 13 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ykAiHa_paDf", - "colab_type": "text" - }, - "source": [ - "If we look to phrase **'Had mobile 11 months ? U R entitled Update latest colour mobiles camera Free ! Call The Mobile Update Co FREE 08002986030'** specially verb *\"entitled\"* it is in the past simple, so normally when we stemmed we should have ***'entitle'*** but the result has ***'entitl'*** even the lemmatized he keeped the ***\"ed\"***. The same thing applies to message **'WINNER ! ! As valued network customer selected receivea £900 prize reward ! To claim call 09061701461 . Claim code KL341 . Valid 12 hours .\t\"** with the verbs \"select, value\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7aMTeMi-paDi", - "colab_type": "text" - }, - "source": [ - "# **5- Exécutez une TF-IDF sur les messages après Stemming ou Lemmatization**" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlGe8aRltBE0", - "colab_type": "text" - }, - "source": [ - "\n", - "\n", - "* **sublinear_df** est mis on True pour utiliser la forme logarithmique de la fréquence.\n", - "* **min_df** est le nombre minimum de documents dans lesquels un mot doit être présent pour être conservé.\n", - "* **norm is** est mis en \"l2\", pour garantir que tous nos vecteurs de caractéristiques ont une norme euclidienne de 1.\n", - "* **ngram_range** est réglé sur (1, 2) pour indiquer que nous voulons considérer à la fois les unigrammes (un mot) et les bigrammes (deux mots).\n", - "\n", - "Pour plus de détails, voir: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"ioikwpoYxGCV", - "colab_type": "text" - }, - "source": [ - "## 5.1. Le cas du Lemmatizing" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Nqih3IykpaDj", - "colab_type": "code", - "outputId": "822a67f2-56d2-4a57-cf66-43347d01f360", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 402 - } - }, - "source": [ - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "\n", - "corpus_lemm = spam.Message_lemmatized_before_stemming\n", - "\n", - "#Initialisation du TfidfVectorizer\n", - "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", - "\n", - "#transformation du message Lemmatizing\n", - "Y = vectorizer.fit_transform(corpus_lemm)\n", - "\n", - "# on crée une dataFrame pour avoir les mots les plus fréquents et son TFIDF (ce fréquence est en pourcentage)\n", - "df2 = pd.DataFrame(np.round(Y[0].T.todense() *100,2), index=vectorizer.get_feature_names(), columns=[\"tfidf % - lemmatize\"])\n", - "df2.sort_values(by=[\"tfidf % - lemmatize\"],ascending=False)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tfidf % - lemmatize
bugis36.84
la36.84
cine36.84
crazy33.77
available32.62
......
friends0.00
friendship0.00
friendship good0.00
frm0.00
zed0.00
\n", - "

2566 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " tfidf % - lemmatize\n", - "bugis 36.84 \n", - "la 36.84 \n", - "cine 36.84 \n", - "crazy 33.77 \n", - "available 32.62 \n", - "... ... \n", - "friends 0.00 \n", - "friendship 0.00 \n", - "friendship good 0.00 \n", - "frm 0.00 \n", - "zed 0.00 \n", - "\n", - "[2566 rows x 1 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 14 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jLAkFcY4xM52", - "colab_type": "text" - }, - "source": [ - "## 5.2. Le cas du Stemming" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "C3xTSLELpaDt", - "colab_type": "code", - "outputId": "53e31058-e233-468b-e580-bd3805c53488", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 402 - } - }, - "source": [ - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "corpus_stem = spam.Message_stemmed_without_StopWords\n", - "\n", - "#Initialisation du TfidfVectorizer\n", - "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", - "\n", - "#transformation du message stemmé\n", - "X = vectorizer.fit_transform(corpus_stem)\n", - "\n", - "# on crée une dataFrame pour avoir les mots les plus fréquents et son TFIDF (ce fréquence est en pourcentage)\n", - "df1 = pd.DataFrame(np.round(X[0].T.todense() *100,2), index=vectorizer.get_feature_names(), columns=[\"tfidf % - stemmed\"])\n", - "df1.sort_values(by=[\"tfidf % - stemmed\"],ascending=False)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tfidf % - stemmed
la39.31
bugi39.31
cine39.31
avail35.38
point31.77
......
friend callertun0.00
friend repli0.00
friends0.00
friendship0.00
zed0.00
\n", - "

2534 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " tfidf % - stemmed\n", - "la 39.31 \n", - "bugi 39.31 \n", - "cine 39.31 \n", - "avail 35.38 \n", - "point 31.77 \n", - "... ... \n", - "friend callertun 0.00 \n", - "friend repli 0.00 \n", - "friends 0.00 \n", - "friendship 0.00 \n", - "zed 0.00 \n", - "\n", - "[2534 rows x 1 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 15 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_6tF5a_3u_5C", - "colab_type": "text" - }, - "source": [ - "# **6 (Bonus): Lancez une classification pour détecter les *spams***" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LtzJ3lqhpaD-", - "colab_type": "code", - "outputId": "8cd7b995-8324-44fb-b612-ff9b0925ab21", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 70 - } - }, - "source": [ - "spam.Category.value_counts()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "ham 4825\n", - "spam 747 \n", - "Name: Category, dtype: int64" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 16 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yDR1J_lDx0sB", - "colab_type": "text" - }, - "source": [ - "On remarque le nombre de message par catégorie est déséquilibré, il est biasé sur le ham que le spam" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "S2v7WbAJyG0L", - "colab_type": "code", - "outputId": "befe447c-2f37-4edc-aa84-471830a9d314", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 422 - } - }, - "source": [ - "import matplotlib.pyplot as plt\n", - "spam.groupby('Category').Message_with_stopwords.count().plot.bar(ylim=0,figsize=(8,6))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 17 - }, - { - "output_type": "display_data", - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAewAAAGECAYAAAD0jRNXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAVRklEQVR4nO3dfZBdd33f8c8XC3BaAjZYOFQyyAnu\ntIYASTTGTNJOgKlsMKnIQKgzKajEU/1R2gmdNsG0MC5PLQzT8JBpSFzsRmYA46ZQK8EEPOYhDcVg\nGYMfeBgUY8ZWASvIOBBqasO3f9yzZGMk78q+2stv9/Wa2dlzfufce393Rtr3nnvPnlvdHQDgR9uD\nFj0BAGBlgg0AAxBsABiAYAPAAAQbAAawadETuC8nnXRSb9u2bdHTAIA1c+211/5Fd2++9/iqgl1V\ntyT5VpLvJbmnu7dX1SOTvCfJtiS3JHlBd99RVZXkLUmeneQ7Sf5Zd396up9dSV4x3e1ru3vPfT3u\ntm3bsm/fvtVMEQDWhar6yuHGj+Yl8ad391O6e/u0fn6Sq7r7tCRXTetJ8qwkp01fu5O8bZrAI5Nc\nkOSpSc5IckFVnXi0TwQANqIH8h72ziRLR8h7kjx32fglPXN1khOq6jFJzkpyZXcf6u47klyZ5OwH\n8PgAsGGsNtid5ENVdW1V7Z7GTu7ur07LX0ty8rS8Jcmty2572zR2pHEAYAWrPensF7r7QFU9OsmV\nVfWF5Ru7u6tqLtc4nX4h2J0kj33sY+dxlwAwvFUdYXf3gen77Unel9l70F+fXurO9P32afcDSU5Z\ndvOt09iRxu/9WBd29/bu3r558w+dJAcAG9KKwa6qv11VP760nGRHkhuT7E2ya9ptV5LLp+W9SV5U\nM2cmuXN66fyDSXZU1YnTyWY7pjEAYAWreUn85CTvm/21VjYleVd3/0lVXZPksqo6L8lXkrxg2v+K\nzP6ka39mf9b14iTp7kNV9Zok10z7vbq7D83tmQDAOlY/yh+vuX379vZ32ABsJFV17bI/of4BlyYF\ngAEINgAMQLABYACCDQADEGwAGIBgA8AAfqQ/D3u923b++xc9BR6AW15/zqKnAGwgjrABYACCDQAD\nEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoAB\nCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AA\nBBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAA\ngg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACrDnZVHVdV11XVH0/rp1bVJ6tqf1W9\np6oeMo0/dFrfP23ftuw+Xj6Nf7Gqzpr3kwGA9epojrB/I8nnl62/IcmbuvvxSe5Ict40fl6SO6bx\nN037papOT3JukickOTvJ71bVcQ9s+gCwMawq2FW1Nck5Sd4+rVeSZyT5w2mXPUmeOy3vnNYzbX/m\ntP/OJJd293e7+8tJ9ic5Yx5PAgDWu9UeYb85yW8l+f60/qgk3+zue6b125JsmZa3JLk1Sabtd077\n/2D8MLf5garaXVX7qmrfwYMHj+KpAMD6tWKwq+o5SW7v7mvXYD7p7gu7e3t3b9+8efNaPCQA/Mjb\ntIp9fj7JP66qZyc5PsnDk7wlyQlVtWk6it6a5MC0/4EkpyS5rao2JXlEkm8sG1+y/DYAwH1Y8Qi7\nu1/e3Vu7e1tmJ419uLt/LclHkjx/2m1Xksun5b3TeqbtH+7unsbPnc4iPzXJaUk+NbdnAgDr2GqO\nsI/kZUkurarXJrkuyUXT+EVJ3lFV+5Mcyizy6e6bquqyJJ9Lc
k+Sl3T39x7A4wPAhnFUwe7ujyb5\n6LR8cw5zlnd335XkV45w+9cled3RThIANjpXOgOAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAG\nINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQAD\nEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoAB\nCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AA\nBBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGsGKwq+r4qvpUVX22qm6qqldN\n46dW1Seran9VvaeqHjKNP3Ra3z9t37bsvl4+jX+xqs46Vk8KANab1RxhfzfJM7r7yUmekuTsqjoz\nyRuSvKm7H5/kjiTnTfufl+SOafxN036pqtOTnJvkCUnOTvK7VXXcPJ8MAKxXKwa7Z749rT54+uok\nz0jyh9P4niTPnZZ3TuuZtj+zqmoav7S7v9vdX06yP8kZc3kWALDOreo97Ko6rqo+k+T2JFcm+fMk\n3+zue6ZdbkuyZVrekuTWJJm235nkUcvHD3Ob5Y+1u6r2VdW+gwcPHv0zAoB1aFXB7u7vdfdTkmzN\n7Kj47x2rCXX3hd29vbu3b968+Vg9DAAM5ajOEu/ubyb5SJKnJTmhqjZNm7YmOTAtH0hySpJM2x+R\n5BvLxw9zGwDgPqzmLPHNVXXCtPxjSf5Rks9nFu7nT7vtSnL5tLx3Ws+0/cPd3dP4udNZ5KcmOS3J\np+b1RABgPdu08i55TJI90xndD0pyWXf/cVV9LsmlVfXaJNcluWja/6Ik76iq/UkOZXZmeLr7pqq6\nLMnnktyT5CXd/b35Ph0AWJ9WDHZ3X5/kZw4zfnMOc5Z3d9+V5FeOcF+vS/K6o58mAGxsrnQGAAMQ\nbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEI\nNgAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAE\nGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACC\nDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADB\nBoABCDYADECwAWAAKwa7qk6pqo9U1eeq6qaq+o1p/JFVdWVVfWn6fuI0XlX11qraX1XXV9XPLruv\nXdP+X6qqXcfuaQHA+rKaI+x7kvyb7j49yZlJXlJVpyc5P8lV3X1akqum9SR5VpLTpq/dSd6WzAKf\n5IIkT01yRpILliIPANy3FYPd3V/t7k9Py99K8vkkW5LsTLJn2m1PkudOyzuTXNIzVyc5oaoek+Ss\nJFd296HuviPJlUnOnuuzAYB16qjew66qbUl+Jsknk5zc3V+dNn0tycnT8pYkty672W3T2JHG7/0Y\nu6tqX1XtO3jw4NFMDwDWrVUHu6oeluR/JHlpd//l8m3d3Ul6HhPq7gu7e3t3b9+8efM87hIAhreq\nYFfVgzOL9Tu7+73T8Nenl7ozfb99Gj+Q5JRlN986jR1pHABYwWrOEq8kFyX5fHf/9rJNe5Msnem9\nK8nly8ZfNJ0tfmaSO6eXzj+YZEdVnTidbLZjGgMAVrBpFfv8fJIXJrmhqj4zjf27JK9PcllVnZfk\nK0leMG27Ismzk+xP8p0kL06S7j5UVa9Jcs2036u7+9BcngUArHMrBru7/yxJHWHzMw+zfyd5yRHu\n6+IkFx/NBAEAVzoDgCEIN
gAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFg\nAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAw\nAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAY\ngGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0AAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAM\nQLABYACCDQADEGwAGIBgA8AABBsABrBisKvq4qq6vapuXDb2yKq6sqq+NH0/cRqvqnprVe2vquur\n6meX3WbXtP+XqmrXsXk6ALA+reYI+w+SnH2vsfOTXNXdpyW5alpPkmclOW362p3kbcks8EkuSPLU\nJGckuWAp8gDAylYMdnf/aZJD9xremWTPtLwnyXOXjV/SM1cnOaGqHpPkrCRXdveh7r4jyZX54V8C\nAIAjuL/vYZ/c3V+dlr+W5ORpeUuSW5ftd9s0dqTxH1JVu6tqX1XtO3jw4P2cHgCsLw/4pLPu7iQ9\nh7ks3d+F3b29u7dv3rx5XncLAEO7v8H++vRSd6bvt0/jB5Kcsmy/rdPYkcYBgFW4v8Hem2TpTO9d\nSS5fNv6i6WzxM5PcOb10/sEkO6rqxOlksx3TGACwCptW2qGq3p3kF5OcVFW3ZXa29+uTXFZV5yX5\nSpIXTLtfkeTZSfYn+U6SFydJdx+qqtckuWba79Xdfe8T2QCAI1gx2N39q0fY9MzD7NtJXnKE+7k4\nycVHNTsAIIkrnQHAEAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsABrDipUkB\n1ptt579/0VPgAbjl9ecsegoL4QgbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsA\nBiDYADAAwQaAAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADECwAWAAgg0A\nAxBsABiAYAPAAAQbAAYg2AAwAMEGgAEINgAMQLABYACCDQADEGwAGIBgA8AABBsABiDYADAAwQaA\nAQg2AAxAsAFgAIINAAMQbAAYgGADwAAEGwAGINgAMADBBoABCDYADGDNg11VZ1fVF6tqf1Wdv9aP\nDwAjWtNgV9VxSf5LkmclOT3Jr1bV6Ws5BwAY0VofYZ+RZH9339zd/y/JpUl2rvEcAGA4ax3sLUlu\nXbZ+2zQGANyHTYuewL1V1e4ku6fVb1fVFxc5Hx6Qk5L8xaIncazUGxY9Azgi//fG9rjDDa51sA8k\nOWXZ+tZp7Ae6+8IkF67lpDg2qmpfd29f9Dxgo/F/b31a65fEr0lyWlWdWlUPSXJukr1rPAcAGM6a\nHmF39z1V9S+TfDDJcUku7u6b1nIOADCiNX8Pu7uvSHLFWj8uC+GtDVgM//fWoeruRc8BAFiBS5MC\nwAAEGwAGINgAMIAfuQunML6qelKSbVn276u737uwCQGsA4LNXFXVxUmelOSmJN+fhjuJYMMxNH24\n0jn54V+Wf3tRc2K+BJt5O7O7fQIbrL0/SnJXkhvy178ss44INvP2iao6vbs/t+iJwAaztbuftOhJ\ncOwINvN2SWbR/lqS7yapJO0HCRxzH6iqHd39oUVPhGNDsJm3i5K8MF6Wg7V2dZL3VdWDktydv/5l\n+eGLnRbz4kpnzFVVfaK7n7boecBGU1VfTrIzyQ3tB/u65Aibebuuqt6V2Qkw310a9GddcMzdmuRG\nsV6/BJt5+7HMQr1j2Zg/64Jj7+YkH62qD+Rv/rLsz7rWCcFmrrr7xYueA2xQX56+HjJ9sc54D5u5\nqqrjk5yX5AlJjl8a7+5fX9ikANYB1xJn3t6R5CeSnJXkY0m2JvnWQmcEG0BVba6qN1bVFVX
14aWv\nRc+L+RFs5u3x3f3KJH/V3Xsyu1TiUxc8J9gI3pnkC0lOTfKqJLckuWaRE2K+BJt5u3v6/s2qemKS\nRyR59ALnAxvFo7r7oiR3d/fHprehnrHoSTE/Tjpj3i6sqhOTvCLJ3iQPS/LKxU4JNoSlX5a/WlXn\nJPk/SR65wPkwZ046Y66q6qFJnpfZJwY9eBru7n71wiYFG0BVPSfJ/0pySpLfSfLwJK/q7r0LnRhz\nI9jMVVX9SZI7k1yb5HtL4939nxc2KYB1QLCZq6q6sbufuOh5wEZTVT+Z5C1JnpbZdfw/keRfd/fN\nC50Yc+OkM+btf1fVTy96ErABvSvJZZn9WeXfSfLfk7x7oTNirhxhMxdVdUNmlyDdlOS0zC6T6OM1\nYY1U1fX3/n9WVZ/t7icvak7Ml2AzF1X1uPva3t1fWau5wEZUVW9IckeSSzP75fmfJDkxyRuTpLsP\nLW52zINgA6wD08drLln6wV5L6939k2s8JebMe9gA68PLkjy5u09N8t+SfDbJ87r7VLFeHwQbYH14\nRXf/ZVX9QmZXOHt7krcteE7MkWADrA9L1z04J8l/7e73x8dsriuCDbA+HKiq38/sZLMrpqsO+hm/\njjjpDGAdqKq/leTsJDd095eq6jFJfrq7P7TgqTEngg0AA/ByCQAMQLABYACCDYOrqp+oqkur6s+r\n6tqquqKq/u4R9j2hqv7FWs8ReOAEGwZWVZXkfUk+2t0/1d0/l+TlSU4+wk1OSHLMg11Vm471Y8BG\nI9gwtqcnubu7f29poLs/m+S6qrqqqj5dVTdU1c5p8+uT/FRVfaaq3pgkVfWbVXVNVV1fVa9aup+q\nemVVfbGq/qyq3l1V/3Yaf0pVXT3t/76qOnEa/2hVvbmq9iX591X15ap68LTt4cvXgaPnt2AY2xOT\nXHuY8buS/PJ05auTklxdVXuTnJ/kid39lCSpqh2ZfbraGZldd3pvVf3DJP83yfOSPDnJg5N8etnj\nXJLkX3X3x6rq1UkuSPLSadtDunv7dN/bMruIx/9Mcm6S93b33XN87rChCDasT5XkP07x/X6SLTn8\ny+Q7pq/rpvWHZRbwH09yeXffleSuqvqjJKmqRyQ5obs/Nu2/J7PPXV7ynmXLb0/yW5kF+8VJ/vkc\nnhdsWIINY7spyfMPM/5rSTYn+bnuvruqbkly/GH2qyT/qbt//28MVr30MPuuxl8tLXT3x6tqW1X9\nYpLjuvvG+3mfQLyHDaP7cJKHVtXupYGqelKSxyW5fYr106f1JPlWZkfPSz6Y5Ner6mHTbbdU1aOT\nfDzJL1XV8dO25yRJd9+Z5I6q+gfT7V+Y5GM5skuSvCuzT48CHgBH2DCw7u6q+uUkb66ql2X23vUt\nSf5DkrdW1Q1J9iX5wrT/N6rq41V1Y5IPdPdvVtXfT/KJ2Qnn+XaSf9rd10zveV+f5OtJbkhy5/Sw\nu5L83nQpzJsze7n7SN6Z5LVJ3j3Hpw0bkkuTAodVVQ/r7m9PYf7TJLu7+9NHeR/PT7Kzu194TCYJ\nG4gjbOBILqyq0zN773vP/Yj17yR5VpJnH4vJwUbjCBsABuCkMwAYgGADwAAEGwAGINgAMADBBoAB\n/H/++ODwA5LRyQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmxodxIbz-iq", - "colab_type": "text" - }, - "source": [ - "## 6.1. Le cas du stemming" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ywiY6StK0QIL", - "colab_type": "text" - }, - "source": [ - "On crée la colonne category_id pour assigner un ID à chaque catégorie, puis on supprime les doublons pour obtenir la correspondance entre catégories et ID" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "nro-kS8gpaER", - "colab_type": "code", - "colab": {} - }, - "source": [ - "spam['category_id'] = spam.Category.factorize()[0]\n", - "category_id_df = spam[['Category', 'category_id']].drop_duplicates().sort_values('category_id')\n", - "category_to_id = dict(category_id_df.values)\n", - "id_to_category = dict(category_id_df[['category_id', 'Category']].values)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GtFcat500quA", - "colab_type": "text" - }, - "source": [ - "Création d'un dataframe spécifique aux messages stemmés" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wXK2rIx_paEn", - "colab_type": "code", - "colab": {} - }, - "source": [ - "col = ['Category', 'Message_stemmed_without_StopWords','category_id']\n", - "df_stem = spam[col]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "jgBG8oKKpaEs", - "colab_type": "code", - "outputId": "be9b9fbe-ce97-4b42-fced-ff5e29a73029", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 212 - } - }, - "source": [ - "df_stem.head()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMessage_stemmed_without_StopWordscategory_id
0hamgo jurong point , crazy.. avail bugi n great world la e buffet ... cine got amor wat ...0
1hamok lar ... joke wif u oni ...0
2spamfree entri 2 wkli comp win fa cup final tkts 21st may 2005 . text fa 87121 receiv entri question ( std txt rate ) t & c 's appli 08452810075over18 's1
3hamu dun say earli hor ... u c alreadi say ...0
4hamnah i n't think goe usf , live around though0
\n", - "
" - ], - "text/plain": [ - " Category ... category_id\n", - "0 ham ... 0 \n", - "1 ham ... 0 \n", - "2 spam ... 1 \n", - "3 ham ... 0 \n", - "4 ham ... 0 \n", - "\n", - "[5 rows x 3 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 20 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZDmA1G6Wy7So", - "colab_type": "text" - }, - "source": [ - "Maintenant, chacun des 5572 messages stemmés est représenté par 2534 fonctionnalités, représentant le score tf-idf pour différents unigrammes et bigrammes" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "aOs3R6oCpaEM", - "colab_type": "code", - "outputId": "518bce1e-6708-41b2-e3bb-b5b5d73d7ff5", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "corpus_stem = spam.Message_stemmed_without_StopWords\n", - "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", - "featuresStem = vectorizer.fit_transform(corpus_stem).toarray()\n", - "labels = df_stem.category_id\n", - "featuresStem.shape" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(5572, 2534)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 21 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BCDjJum102Di", - "colab_type": "text" - }, - "source": [ - "### 6.1.1. 
Classification à l'aide du Naive bayes" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9UmH7Jb-paE2", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from sklearn.model_selection import train_test_split\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.feature_extraction.text import TfidfTransformer\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(df_stem['Message_stemmed_without_StopWords'], df_stem['Category'], random_state = 0)\n", - "count_vect = CountVectorizer()\n", - "X_train_counts = count_vect.fit_transform(X_train)\n", - "tfidf_transformer = TfidfTransformer()\n", - "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)\n", - "\n", - "clf = MultinomialNB().fit(X_train_tfidf, y_train)\n", - "y_pred = clf.predict(count_vect.transform(X_test))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "1izmZb2iw3Ov", - "colab_type": "code", - "outputId": "94c71c1d-2a39-4105-ebf6-f6a3cfbdc9ac", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 175 - } - }, - "source": [ - "from sklearn import metrics\n", - "print(metrics.classification_report(y_test, y_pred, target_names=df_stem['Category'].unique()))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " ham 0.98 1.00 0.99 1208\n", - " spam 0.99 0.89 0.94 185\n", - "\n", - " accuracy 0.98 1393\n", - " macro avg 0.99 0.94 0.96 1393\n", - "weighted avg 0.98 0.98 0.98 1393\n", - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jgFWSxQ7paE6", - "colab_type": "code", - "outputId": "850cfa70-51b7-496f-8dd4-a930f3ff17ea", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(clf.predict(count_vect.transform([\"Free entry 2 wkly comp win FA 
Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's\"])))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "['spam']\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "5myKRjX5paE_", - "colab_type": "code", - "outputId": "2e19fb36-0ea4-44d4-fd67-025bd16eb343", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(clf.predict(count_vect.transform([\"Nah I n't think go usf , life around though\"])))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "['ham']\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tZxOPEOP34LN", - "colab_type": "text" - }, - "source": [ - "On remarque que 22 messages ont été mal classés (21 spam, 1 ham)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "5_9AetF1paFG", - "colab_type": "code", - "outputId": "65985204-3f4d-47ff-9361-cd1201d9e956", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 388 - } - }, - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "conf_mat = confusion_matrix(y_test, y_pred)\n", - "fig, ax = plt.subplots(figsize=(8,6))\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", - " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", - "plt.ylabel('Actual')\n", - "plt.xlabel('Predicted')\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png":
"iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAc80lEQVR4nO3deZxcVbXo8d8iERIZA8iUIJNwkUHC\noCLqZVLmSXhX4CEi8gxX4TogKioyePHigPg+CA8JCASQSSESBASMiEEJY4CAoOYyXImBgAoqIAR6\nvT/qJFRihu6u7D6dU78vn/Op6l2nau/i07B67b3OPpGZSJKkxW+pugcgSVJTGWQlSSrEICtJUiEG\nWUmSCjHISpJUiEFWkqRChtY9gAWZ9eyjXlukJd7wtd5b9xCkxeLVV6ZHqc/u9P/3b1h1/WJj69Sg\nDbKSpC7R81rdIyjG6WJJkgoxk5Uk1St76h5BMQZZSVK9egyykiQVkQ3OZF2TlSSpEDNZSVK9nC6W\nJKmQBk8XG2QlSfVq8HWyBllJUr0anMla+CRJUiFmspKkeln4JElSGU2+TtYgK0mql5msJEmFNDiT\ntfBJkqRCzGQlSfXyOllJkgpp8HSxQVaSVK8GFz65JitJUiFmspKkejldLElSIQ2eLjbISpJqlWl1\nsSRJZTR4utjCJ0mSCjGTlSTVyzVZSZIKafB0sUFWklQvt1WUJKmQBmeyFj5JklSImawkqV4WPkmS\nVEiDp4sNspKkejU4k3VNVpKkQsxkJUn1anAma5CVJNXKGwRIklSKmawkSYU0uLrYwidJkgoxk5Uk\n1cvpYkmSCmnwdLFBVpJULzNZSZIKaXAma+GTJEmFmMlKkurldLEkSYUYZCVJKsQ1WUmS1FdmspKk\nejldLElSIQ2eLjbISpLq1eBM1jVZSVK9sqezYxEi4vyImBkRD7a1rRwRN0fE76vHEVV7RMQZETEt\nIh6IiK3a3nNYdf7vI+Kw3nw1g6wkqekuBHabp+04YGJmbghMrH4G2B3YsDrGAGdDKygDJwLvBN4B\nnDg7MC+MQVaSVK+ens6ORcjMXwJ/nqd5X2Bc9XwcsF9b+0XZMhlYKSLWBHYFbs7MP2fmX4Cb+efA\n/U9ck5Uk1aueNdnVM3NG9fwpYPXq+UjgD23nPVm1Lah9ocxkJUn1yuzoiIgxEXF32zGmb91nAlni\nq5nJSpLq1WEmm5ljgbF9fNvTEbFmZs6opoNnVu3TgbXbzhtVtU0Hdpin/ReL6sRMVpLUjSYAsyuE\nDwOuaWv/cFVlvC3wfDWtfCOwS0SMqAqedqnaFspMVpJUr8JrshFxGa0sdNWIeJJWlfDXgSsj4gjg\nCeCD1enXA3sA04AXgcMBMvPPEfGfwF3VeV/NzHmLqf6JQVaSVK/COz5l5sELeGnn+ZybwFEL+Jzz\ngfP70rdBVpJUL3d8kiRJfWUmK0mqVxa5emZQMMhKkurV4Olig6wkqV4GWUmSCmnw/WQtfJIkqRAz\nWUlSrbLHwidJkspwTVaSpEIavCZrkJUk1avB08UWPkmSVIiZrCSpXq7JSpJUiEFWkqRCGrx3sWuy\nkiQVYibbMMf/1+n88ld3svKIlfjxJd8D4LQzz+PWX93B0DcMZe2Ra3LKl45hheWXA+Dci67g6p/c\nyJClluKLn/k4737n1jz2xJMce8Kpcz7zyT/O4Oj/cyiHHviBWr6TtCDnjv02e+7xPmY+8yyjt/yn\n+29rSdHg6WIz2YbZb4/3873TT5mr7V1v35LxF3+P8Redzbprj+S8i68A4L8fe4IbJt7KNZd8j++d\nfgr/edqZvPbaa6y3ziiuGncWV407iyvPP4Nhw4ax8/bb1fF1pIW66KIr2XOvQ+oehjrVk50dg5hB\ntmG2Gb05K66w/Fxt737n1gwdOgSAt226MU/PfBaAn0+azO47b
8/SSy/NqLXW4M2j1mLqw7+b672T\n776PtUeuyVprrD4wX0Dqg0m33cGf//Jc3cNQp7Kns2MQKz5dHBFvA9Zt7yszry7dr+Zv/HU3sdvO\n2wMw85k/8bbNNp7z2uqrrcrMZ56d6/wbJt7KHu/bfkDHKKnLDPJstBNFM9mIOB84HzgA2Ls69lrI\n+WMi4u6IuPu8iy4rObSudM64yxgyZAh77bJjr86fNWsWv7jtDnbZ6b2FRyZJzVQ6k902Mzfp7cmZ\nORYYCzDr2Ueb+6dNDX583c388ld3ct4ZpxIRAKz2plV46uln5pzz9MxnWe1Nq875edLku3nrRhuw\n6sojBny8krpHWvjUb7dHRK+DrMq4bfLdnH/pD/nuN05k+LBhc9p3fM+23DDxVl555RWe/ONT/M+T\nf2Tzt2405/Xrb/4Fe7x/hxpGLKmrNLjwqXQmexGtQPsU8DIQQGbm2wr327U+d+LXuWvKAzz33F/Z\neb8P8YkjDuW8i6/glVmz+Ninvwy0ip9O/Px/8Jb112HXnd7LPoccydAhQ/jyMZ9gyJBWgdSLL/2D\n2++awomf/2SdX0daqEsuPovt//VdrLrqyjz+6N2c/NXTuODCy+selvpqkBcvdSKy4E4bETENOAaY\nCsz5t5iZTyzqvU4XqwmGr+V6tprh1VemR6nPfuGUD3X0//tlj7+k2Ng6VTqTfSYzJxTuQ5K0JBvk\nU76dKB1kp0TEpcC1tKaLAS/hkSS1aXDhU+kgO5xWcN2lrS0Bg6wkqcVMtn8y8/CSny9JaoAGFz4V\nDbIRMQw4AtgUmHPtSGZ+tGS/kiQNBqWvk70YWAPYFbgVGAX8rXCfkqQlidfJ9ttbMvPfImLfzBxX\nFUFNKtynJGkJ0uQdn0oH2VnV43MRsRnwFLBa4T4lSUuSQZ6NdqJ0kB0bESOA44EJwHLAVwr3KUla\nkhhk++1iWnfgWRcYV7V5Y1JJUlcoHWSvAZ4H7qFtMwpJkubwEp5+G5WZuxXuQ5K0JHO6uN9+HRGb\nZ+bUwv1IkpZQaZDtm4iYSmv7xKHA4RHxKN7qTpLUZUplsnsV+lxJUtOYyfZNb+4XK0kS4F14JEkq\nxkxWkqRCGhxkS98gQJKkrmUmK0mqVWZzM1mDrCSpXg2eLjbISpLqZZCVJKmMJu/4ZOGTJEmFmMlK\nkurV4EzWICtJqldzN3wyyEqS6uWarCRJ6jMzWUlSvRqcyRpkJUn1ck1WkqQymrwma5CVJNWrwZms\nhU+SJBViJitJqlWTp4vNZCVJ9erp8FiEiPiXiLiv7fhrRHw6Ik6KiOlt7Xu0veeLETEtIn4bEbv2\n96uZyUqSapWF12Qz87fAaICIGAJMB8YDhwPfyczT2s+PiE2Ag4BNgbWAn0XERpn5Wl/7NpOVJNWr\ncCY7j52B/87MJxZyzr7A5Zn5cmY+BkwD3tHnnjDISpKWcBExJiLubjvGLOT0g4DL2n4+OiIeiIjz\nI2JE1TYS+EPbOU9WbX1mkJUk1Sp7Ojwyx2bmNm3H2Pn1ExFLA/sAP6yazgY2oDWVPAP49uL+bq7J\nSpLqNXDXye4O3JuZTwPMfgSIiHOBn1Q/TgfWbnvfqKqtz8xkJUm16jST7YODaZsqjog12177APBg\n9XwCcFBELBMR6wEbAnf257uZyUqSGi8ilgXeDxzZ1vzNiBgNJPD47Ncy86GIuBL4DfAqcFR/KovB\nICtJqlnpS3gAMvMFYJV52g5dyPlfA77Wab8GWUlSrQYiyNbFICtJqldG3SMoxiArSapVkzNZq4sl\nSSrETFaSVKvscbpYkqQimjxdbJCVJNUqLXySJKmMJmeyFj5JklSImawkqVYWPkmSVEhm3SMoxyAr\nSapVkzNZ12QlSSrETFaSVKsmZ7IGWUlSrVyTlSSpEDNZSZIKafKOTxY+SZJUiJmsJKlWTd5W0SAr\nSapVT4Oniw2ykqRaNXlNd
oFBNiKuBRZYWJ2Z+xQZkSSpq3RrdfFpAzYKSZIaaIFBNjNvHciBSJK6\nU1dvRhERGwKnApsAw2a3Z+b6BcclSeoS3TpdPNsFwInAd4AdgcPx+lpJ0mLS5Ori3gTL4Zk5EYjM\nfCIzTwL2LDssSZKWfL3JZF+OiKWA30fE0cB0YLmyw5IkdYsmX8LTm0z2U8AbgU8CWwOHAoeVHJQk\nqXtkdnYMZovMZDPzrurp32mtx0qStNg0eU22N9XFtzCfTSkyc6ciI5IkdZUmTxf3Zk322Lbnw4AD\ngFfLDEeSpObozXTxPfM0/Soi7iw0HklSlxns66qd6M108cptPy5Fq/hpxWIjqiw/aofSXUjFjV7F\nPVukRenqNVngHlprskFrmvgx4IiSg5IkdY9uX5N9a2b+o70hIpYpNB5JUpdpcibbm+tkfz2fttsX\n90AkSWqahd1Pdg1gJDA8IrakNV0MsAKtzSkkSepYg+ueFjpdvCvwEWAU8G1eD7J/Bb5UdliSpG7R\n5Onihd1PdhwwLiIOyMyrBnBMkqQu0uTCp96syW4dESvN/iEiRkTEKQXHJElSI/QmyO6emc/N/iEz\n/wLsUW5IkqRu0tPhMZj15hKeIRGxTGa+DBARwwEv4ZEkLRZJc6eLexNkfwBMjIgLaBU/fQQYV3JQ\nkqTu0dPg8uLe7F38jYi4H3gfrUrrG4F1Sg9MktQdehqcyfZmTRbgaVoB9t+AnYCHi41IkqSGWNhm\nFBsBB1fHs8AVQGTmjgM0NklSF+jWNdlHgEnAXpk5DSAiPjMgo5IkdY3BXiHciYVNF+8PzABuiYhz\nI2JnaPCfG5KkWiTR0TGYLTDIZuaPM/MgYGPgFuDTwGoRcXZE7DJQA5QkaUm1yMKnzHwhMy/NzL1p\n7WM8BfhC8ZFJkrpCt29GMUe129PY6pAkqWODPVB2ok9BVpKkxW2wr6t2wiArSapVT3NjbK83o5Ak\nSX1kJitJqlWTt1U0yEqSatXg+wMYZCVJ9WpydbFrspKkWvVEdHT0RkQ8HhFTI+K+iLi7als5Im6O\niN9XjyOq9oiIMyJiWkQ8EBFb9fe7GWQlSd1ix8wcnZnbVD8fB0zMzA2BidXPALsDG1bHGODs/nZo\nkJUk1So7PDqwLzCuej4O2K+t/aJsmQysFBFr9qcDg6wkqVYDtK1iAjdFxD0RMaZqWz0zZ1TPnwJW\nr56PBP7Q9t4nq7Y+s/BJklSrTjejqILmmLamsZk57/a/78nM6RGxGnBzRDzS/mJmZkQs9kJng6wk\naYlWBdSF7qmfmdOrx5kRMR54B/B0RKyZmTOq6eCZ1enTgbXb3j6qauszp4slSbXqITo6FiUilo2I\n5Wc/B3YBHgQmAIdVpx0GXFM9nwB8uKoy3hZ4vm1auU/MZCVJtRqAzShWB8ZH63KfocClmfnTiLgL\nuDIijgCeAD5YnX89sAcwDXgROLy/HRtkJUm1Kn2DgMx8FNhiPu1/AnaeT3sCRy2Ovg2ykqRaueOT\nJEnqMzNZSVKtvEGAJEmFNPmm7QZZSVKtmrwma5CVJNWqyUHWwidJkgoxk5Uk1Spdk5UkqYwmTxcb\nZCVJtWpykHVNVpKkQsxkJUm1cjMKSZIKcTMKSZIKafKarEFWklSrJgdZC58kSSrETFaSVCsLnyRJ\nKsTCJ0mSCmnymqxBVpJUqyZPF1v4JElSIWaykqRa9TQ4lzXISpJq5ZqsJEmFNDePdU1WkqRizGQl\nSbVyuliSpELcjEKSpEKsLpYkqZDmhlgLnyRJKsZMVpJUKwufJEkqxDVZSZIKaW6INchKkmrW5Oli\nC58kSSrETFaSVCvXZCVJKqS5IdYgK0mqmWuykiSpz8xkJUm1ygZPGBtkJUm1avJ0sUFWklQrq4sl\nSSqkuSHWwidJkooxyHaJUaPW5MYbL2fKlInce+/POOqojwKw//57cu+9P+PFFx9nq63eVvM
opfk7\n4fTjuGnqBK64Zdxc7Qd+9AB+NOkSrvjFRXzy+I/P9drqI1fjl9Nu5EP/ftBADlX90EN2dAxmThd3\niVdffY0vfOEU7rvvQZZbblluv/06Jk6cxEMP/ZYDDxzDWWedWvcQpQW69sobuOKCq/nqGV+e07b1\ndlvyr7u+h4N3PpxZr8xixCorzfWeY076D3798zsGeqjqBwuftMR76qmZPPXUTAD+/vcXeOSRaYwc\nuQYTJ06qeWTSok2ZfD9rjlpjrrb/ddh+jDvzEma9MguAv/zpuTmvbb/be5n+PzP4x4svDeg41T9N\nvoSn6HRxRAyJiH0i4pMRcczso2SfWrR11hnF6NGbcuedU+oeitRvb15/bUa/cwsuvO4czrn6u2yy\nxcYADH/jcA476n9z7rcvqHmE6q2eDo/BrHQmey3wD2Aqvfh3ERFjgDEAQ4eOYMiQ5cqOrgstu+wb\nueyyczj22JP529/+XvdwpH4bOnQIK660Ah/Z80g2Hf1WTh17Mvu+80DGHHs4l469kpfMYjUIlA6y\nozKz19U0mTkWGAswbNibmzt/UJOhQ4dy+eXncPnl47nmmp/WPRypI0/PeIafX38rAA/d9zDZk6y0\nykpsttUm7LzXDnzyKx9n+RWWo6cneeXlV7jygqtrHrEWpMnTxaWD7A0RsUtm3lS4H/XCOed8i0ce\nmcYZZ5xX91Ckjt3600ls8+6tuOfXU3jz+msz9A1Dee5Pz/Gx/Y6ec86Yzx7Oiy+8ZIAd5Ab7lG8n\nSgfZycD4iFgKmAUEkJm5QuF+NY/ttns7hxxyAFOnPswdd9wAwAknfJNlllma00//Km9608qMH38B\nDzzwG/be+9CaRyvN7Wv/70S23m5LVlp5Ra675yrGnnY+11x2HSd854tcccs4Zs16lZM+9V91D1P9\n1JPNzWQjC365iHgM2BeYmn3syOliNcFmI9apewjSYnH3jElR6rMPXWf/jv5/f/ETVxcbW6dKZ7J/\nAB7sa4CVJHWPJgeI0kH2UeAXEXED8PLsxsw8vXC/kqQlxGDftakTpYPsY9WxdHVIkjQXq4v7KTNP\nLvn5kqQln9XF/RQRbwI+D2wKDJvdnpk7lexXkqTBoPRdeH4APAKsB5wMPA7cVbhPSdISpMl34Skd\nZFfJzO8DszLz1sz8KGAWK0maIzv8Z1EiYu2IuCUifhMRD0XEp6r2kyJiekTcVx17tL3nixExLSJ+\nGxG79ve7lS58mlU9zoiIPYE/AisX7lOStAQZgDXZV4HPZua9EbE8cE9E3Fy99p3MPK395IjYBDiI\n1lLnWsDPImKjzHytrx2XDrKnRMSKwGeB7wIrAJ8p3KckaQlSeiuFzJwBzKie/y0iHgZGLuQt+wKX\nZ+bLwGMRMQ14B3B7X/suOl2cmT/JzOcz88HM3DEzt87MCSX7lCR1l4gYExF3tx1jFnLuusCWwB1V\n09ER8UBEnB8RI6q2kbQ2U5rtSRYelBeo9P1k14+IayPi2YiYGRHXRMT6JfuUJC1ZOi18ysyxmblN\n2zF2fv1ExHLAVcCnM/OvwNnABsBoWpnutxf3dytd+HQpcCWwBq157R8ClxXuU5K0BBmIm7ZHxBto\nBdgfZObVAJn5dGa+lpk9wLm0poQBpgNrt719VNXWZ6WD7Bsz8+LMfLU6LqHtellJkgagujiA7wMP\nt2/rGxFrtp32AeDB6vkE4KCIWCYi1gM2BO7sz3cbiPvJHgdcTmsP6AOB6yNiZYDM/HPh/iVJejdw\nKDA1Iu6r2r4EHBwRo2nFp8eBIwEy86GIuBL4Da3K5KP6U1kM5YPsB6vHI3n9RgtBqzQ6AddnJanL\nld5QIjNvoxV75nX9Qt7zNeBrnfZderr4C8AWmbkecAFwP3BAZq6XmQZYSRKZ2dExmJUOssdn5l8j\n4j20dno6j1Y1lyRJwMAUPtWldJCdPYe9J3BuZl6Ht7y
TJLUpXfhUp9JBdnpEnMPrBU/LDECfkiQN\nCqUD3geBG4FdM/M5WvsWf65wn5KkJUiT78JT+qbtLwJXt/08Z/9ISZKg/N7FdSp9CY8kSQs12LPR\nTrg+KklSIWaykqRaDfYK4U4YZCVJtepxTVaSpDKaG2INspKkmln4JEmS+sxMVpJUqyZnsgZZSVKt\n3IxCkqRCzGQlSSqkydfJWvgkSVIhZrKSpFq5JitJUiGuyUqSVEiTM1nXZCVJKsRMVpJUK6eLJUkq\npMmX8BhkJUm18lZ3kiQV0uRM1sInSZIKMZOVJNXK6WJJkgpp8nSxQVaSVCszWUmSCmlyJmvhkyRJ\nhZjJSpJq5XSxJEmFNHm62CArSapVZk/dQyjGNVlJkgoxk5Uk1cq78EiSVEiTb9pukJUk1cpMVpKk\nQpqcyVr4JElSIWaykqRauRmFJEmFuBmFJEmFNHlN1iArSapVk6uLLXySJKkQM1lJUq2cLpYkqRCr\niyVJKqTJmaxrspIkFWImK0mqVZOriw2ykqRaNXm62CArSaqVhU+SJBXS5G0VLXySJKkQM1lJUq2c\nLpYkqRALnyRJKqTJa7IGWUlSrZqcyVr4JElSIWaykqRaNTmTNchKkmrV3BAL0eS/ILRwETEmM8fW\nPQ6pU/4ua7ByTba7jal7ANJi4u+yBiWDrCRJhRhkJUkqxCDb3VzDUlP4u6xBycInSZIKMZOVJKkQ\ng2wDRcS6EfFg3eOQpG5nkJUkqRCDbHMNiYhzI+KhiLgpIoZHxMci4q6IuD8iroqINwJExIURcXZE\nTI6IRyNih4g4PyIejogLa/4e6jIRsWxEXFf9nj4YEQdGxOMR8c2ImBoRd0bEW6pz946IOyJiSkT8\nLCJWr9pPiohxETEpIp6IiP3b3v/TiHhDvd9S3cIg21wbAmdl5qbAc8ABwNWZ+fbM3AJ4GDii7fwR\nwLuAzwATgO8AmwKbR8ToAR25ut1uwB8zc4vM3Az4adX+fGZuDpwJ/N+q7TZg28zcErgc+Hzb52wA\n7ATsA1wC3FK9/yVgz/JfQzLINtljmXlf9fweYF1gs+ov+6nAIbSC6GzXZqvUfCrwdGZOzcwe4KHq\nvdJAmQq8PyK+ERHvzcznq/bL2h7fVT0fBdxY/U5/jrl/p2/IzFnV5w3h9WA9FX+nNUAMss31ctvz\n12jdDOJC4Ojqr/mTgWHzOb9nnvf24I0kNIAy83fAVrSC4SkRccLsl9pPqx6/C5xZ/U4fyXx+p6s/\nFmfl69cr+jutAWOQ7S7LAzOq9ahD6h6MND8RsRbwYmZeAnyLVsAFOLDt8fbq+YrA9Or5YQM2SKmX\n/Guuu3wFuAN4pnpcvt7hSPO1OfCtiOgBZgEfB34EjIiIB2hlqAdX554E/DAi/gL8HFhv4IcrLZg7\nPkka9CLicWCbzHy27rFIfeF0sSRJhZjJSpJUiJmsJEmFGGQlSSrEICtJUiEGWQmIiNci4r5qr9wf\nzt7XuZ+ftUNE/KR6vk9EHLeQc1eKiE/0o4+TIuLY/o5R0sAwyEotL2Xm6Gqv3FeAf29/MVr6/N9L\nZk7IzK8v5JSVgD4HWUlLBoOs9M8mAW+p7sv724i4CHgQWDsidomI2yPi3irjXQ4gInaLiEci4l5g\n/9kfFBEfiYgzq+erR8T46u4y90fEdsDXgQ2qLPpb1Xmfq+6W9EBEnNz2WV+OiN9FxG3AvwzYvw1J\n/eaOT1KbiBgK7M7rm8lvCByWmZMjYlXgeOB9mflCRHwBOCYivgmcS+uOL9OAKxbw8WcAt2bmByJi\nCLAccBywWWaOrvrfperzHUAAEyLiX4EXgIOA0bT+u72X1o0fJA1iBlmpZXhEzL5r0STg+8BawBOZ\nOblq3xbYBPhVRAAsTWsP3Y1p3fXo9wARcQkwZj597AR8GCAzXwOej4gR85yzS3VMqX5ejlbQXR4Y\nn5kvVn1M6OjbSho
QBlmp5aXZ2eRsVSB9ob0JuDkzD57nvMV5v90ATs3Mc+bp49OLsQ9JA8Q1Wan3\nJgPvjoi3AETEshGxEfAIsG5EbFCdd/AC3j+R1mb3RMSQiFgR+Btz36jhRuCjbWu9IyNiNeCXwH4R\nMTwilgf2XszfTVIBBlmplzLzGeAjwGXV3WBuBzbOzH/Qmh6+rip8mrmAj/gUsGN1g/F7gE0y80+0\npp8fjIhvZeZNwKXA7dV5PwKWz8x7aa313g/cANxV7ItKWmzcu1iSpELMZCVJKsQgK0lSIQZZSZIK\nMchKklSIQVaSpEIMspIkFWKQlSSpEIOsJEmF/H/bcy7eOnS8CQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YnZZwbEf4IIj", - "colab_type": "text" - }, - "source": [ - "### 6.1.2. Classification par Linear SVC (svm)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gJs3Jk27paFQ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from sklearn.model_selection import train_test_split\n", - "from sklearn.svm import LinearSVC\n", - "\n", - "model = LinearSVC()\n", - "\n", - "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(featuresStem, labels, df_stem.index, test_size=0.33, random_state=0)\n", - "model.fit(X_train, y_train)\n", - "y_pred = model.predict(X_test)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BZRcNRos4g4u", - "colab_type": "text" - }, - "source": [ - "On remarque que le f1-score du LinearSVC est amélioré par rapport au Naive Bayes" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "UkErQi5lpaFX", - "colab_type": "code", - "outputId": "83f017c9-fe35-4ec1-ddd5-dfb57307507c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 175 - } - }, - "source": [ - "from sklearn import metrics\n", - "print(metrics.classification_report(y_test, y_pred, target_names=df_stem['Category'].unique()))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " ham 0.99 1.00 0.99 1597\n", - " spam 0.99 0.93 0.96 242\n", - "\n", - " accuracy 0.99 1839\n", - " macro avg 0.99 0.96 0.98 1839\n", - "weighted avg 0.99 0.99 0.99 1839\n", - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DOVdvCHYpaFd", - "colab_type": "code", - "outputId": "fe3710f3-98f7-4fb5-88fc-088db9cc4b88", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 388 - } - }, - "source": [ - "from sklearn.metrics import confusion_matrix\n", -
"import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "conf_mat = confusion_matrix(y_test, y_pred)\n", - "fig, ax = plt.subplots(figsize=(8,6))\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", - " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", - "plt.ylabel('Actuel')\n", - "plt.xlabel('Prédit')\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAee0lEQVR4nO3debhd093A8e8vSUXMY2OIlxSdaNHB\nQ1tqqNY8t1Qp6mlQQ1s19UXRepWi6i3Vxhjz3AotqqpoEaFUxFARVDSoVmiNubm/94+zE5c3wx2y\n7r7Z9/vx7Ofss84+Z6/jOU9+97fWb68dmYkkSZr7BtTdAUmSmsogK0lSIQZZSZIKMchKklSIQVaS\npEIMspIkFTKo7g7MytQXJ3ptkeZ5Q5Zbr+4uSHNF21vPRqnP7um/9+9Z6n3F+tZTfTbISpL6ifZp\ndfegGIeLJUkqxExWklSvbK+7B8UYZCVJ9Wo3yEqSVEQ2OJN1TlaSpELMZCVJ9XK4WJKkQho8XGyQ\nlSTVq8HXyRpkJUn1anAma+GTJEmFmMlKkupl4ZMkSWU0+TpZg6wkqV5mspIkFdLgTNbCJ0lSo0XE\nuRHxQkQ81KHtmIh4NiIeqLbNO7z23YiYEBGPRcQXOrRvWrVNiIjDO3NuM1lJUr3KXyd7PnA6cMG7\n2k/NzJM7NkTEh4GdgdWA5YDfRcT7q5fPADYBJgFjI2J0Zj48uxMbZCVJ9So8XJyZt0fESp08fBvg\nssx8E3gyIiYAa1evTcjMiQARcVl17GyDrMPFkqR6tbf3aIuIERFxb4dtRCfPvH9EPFgNJy9etS0P\nPNPhmElV26zaZ8sgK0map2XmyMz8RIdtZCfediawMrAmMBk4pUTfHC6WJNWrhurizHx++n5EnAVc\nXz19Flihw6HDqjZm0z5LZrKSpHr1cLi4OyJi2Q5PtwOmVx6PBnaOiMERMRxYFbgHGAusGhHDI2I+\nWsVRo+d0HjNZSVKtMstWF0fEpcAGwFIRMQk4GtggItYEEngK2LvVlxwfEVfQKmhqA/bLqoMRsT9w\nEzAQODczx8/x3Jk517/Q3DD1xYl9s2NSFwxZbr26uyDNFW1vPRulPvuNB67v0b/386+5ZbG+9ZTD\nxZIkFeJwsSSpXq5dLElSIQ1eu9ggK0mqV/llFWtjkJUk1avBmayFT5IkFWImK0mql4VPkiQV0uDh\nYoOsJKleDc5knZOVJKkQM1lJUr0anMkaZCVJtSp9g4A6GWQlSfUyk5UkqZAGVxdb+CRJUiFmspKk\nejlcLElSIQ0eLjbISpLqZSYrSVIhDc5kLXySJKkQM1lJUr0cLpYkqRCDrCRJhTgnK0mSuspMVpJU\nL4eLJUkqpMHDxQZZSVK9zGQlSSqkwZmshU+SJBViJitJqpfDxZIkFWKQlSSpkMy6e1CMQVaSVK8G\nZ7IWPkmSVIiZrCSpXg3OZA
2ykqR6Nfg6WYOsJKleDc5knZOVJKkQM1lJUr28hEeSpEIaPFxskJUk\n1csgK0lSIQ2uLrbwSZKkQsxkJUm1ynYLnyRJKsM5WUmSCmnwnKxBVpJUrwYPF1v4JElSIWaykqR6\nOScrSVIhBllJkgpp8NrFzslKklSImWzDHHn8j7n9T/ewxOKL8auLfg7AGedcxNWjb2TxxRYF4Jt7\n7876n1qbqVOncuyPfsr4Rx8nBgSHf3Mf1v7YRwHYY/9DefHFfzF48GAARv7kf1hy8cXq+VLSLAwb\nthznn3sa7x26FJnJ2WdfzE9PP6fubqmrCg8XR8S5wJbAC5m5etV2ErAV8BbwBLBnZk6pXvsusBcw\nDTgwM2+q2jcFTgMGAmdn5glzOrdBtmG23XwTdtlha/77Bye/o323nbZlz112fEfbVaNvBOCXF57J\nP1+awr7fOYrLzj6NAQNaAxwnHH0oq3/o/b3Tcakb2traOOTQY7n/gYdYaKEFuWfMjfzultt55JHH\n6+6auqL8JTznA6cDF3Rouxn4bma2RcSJwHeBwyLiw8DOwGrAcsDvImL6P4RnAJsAk4CxETE6Mx+e\n3YkdLm6YT6z5ERZdZOFOHfvEU39j7Y+vAcCSiy/GwgstyPhH/cdJ847nnnuB+x94CID//OdVHn30\ncZZfbpmae6Uuy/aebXP6+MzbgX+9q+23mdlWPb0bGFbtbwNclplvZuaTwARg7WqbkJkTM/Mt4LLq\n2NkqHmQj4qMRsXVEbD99K31O/X+XXn0d2311X448/se8/Mq/AfjAKsP5wx/vpq1tGpP+/hwPPzaB\n557/x4z3HHX8qeyw+378/LxLyAYXJqgZVlxxGGuusTpj7rm/7q6oq9qzZ1vPfQ24odpfHnimw2uT\nqrZZtc9W0SBbjYOfC+xAa+x7K1rj4rM6fkRE3BsR9559waUlu9av7LTdFtxwxblcff4ZLL3kEpx0\n+lkAbLfFFxi69FLstNeBnHjaL1hz9Q8xYGDrJ3Hi0YfyywvP5IKfncR9f3mI0TfeUudXkGZrwQUX\n4IrLz+Kgg4/m3//+T93dUS/rGDuqbUQX3nsE0AZcXKJvpedk18nMD3f24MwcCYwEmPriRFOnuWSp\nJRafsb/j1pux3yFHAzBo0EAO++beM177yt4HsdIKrT/Mhi69FND6x2uLTTbkoYf/yjabfa4Xey11\nzqBBg7jy8rO49NJf8qtf3TDnN6jPyR4WPnWMHV0REXvQSvw2zreH654FVuhw2LCqjdm0z1Lp4eK7\nqklk1egfL749FXHLbXeyyvtWBOD1N97gtdffAODOe/7MoIEDWXn4irS1TeOlKS8DMLWtjdvuHDPj\nPVJfc9bIU3jk0Qn85LQu/xurvqKG4eKqUvhQYOvMfK3DS6OBnSNicEQMB1YF7gHGAqtGxPCImI9W\ncdToOZ2ndCZ7Aa1A+xzwJhBAZuZHC5+33zrk6BMYe/+DTJnyChtvuyvf2Gs3xt7/II89PhECll9m\nKEcfeiAA/3rpZfb+9hHEgAEMXXpJfvi9gwF4a+pU9j7oSKa2tdE+rZ11PrkWO269aZ1fS5qpT3/q\nk+y26448OO5h7h37WwCOOuoEbrjx9zX3TF1S+C48EXEpsAGwVERMAo6mVU08GLg5IgDuzsx9MnN8\nRFwBPExrGHm/zJxWfc7+wE20LuE5NzPHz/HcJQtaImICcBAwDpjxfzEzn57Tex0uVhMMWW69ursg\nzRVtbz0bpT771eN27dG/9wseeVGxvvVU6Uz2H5k5x3RaktSPNfhWd6WD7P0RcQlwHa3hYgAy85rC\n55UkzSu8QUC3DaEVXD/foS0Bg6wkqcVMtnsyc8+Sny9JaoDChU91KhpkI2J+WossrwbMP709M79W\n8rySJPUFpa+TvRBYBvgCcButi3f/XfickqR5Sf3LKhZTek52lcz8YkRsk5mjqiKoOwqfU5I0
D+np\nik99WekgO7V6nBIRqwPPAe8tfE5J0rykj2ejPVE6yI6MiMWBI2ktP7UQcFThc0qS5iUG2W67kNYd\neFYCRlVtQwufU5KkPqF0kL0WeBm4jw6LUUiSNIOX8HTbsMx0ZXlJ0qw5XNxtd0bERzJzXOHzSJLm\nUWmQ7ZqIGEdr+cRBwJ4RMRFvdSdJ6mdKZbJbFvpcSVLTmMl2TWfuFytJEuBdeCRJKsZMVpKkQhoc\nZEvfIECSpH7LTFaSVKvM5mayBllJUr0aPFxskJUk1csgK0lSGU1e8cnCJ0mSCjGTlSTVq8GZrEFW\nklSv5i74ZJCVJNXLOVlJktRlZrKSpHo1OJM1yEqS6uWcrCRJZTR5TtYgK0mqV4MzWQufJEkqxExW\nklQrh4slSSqlwcPFBllJUq3SICtJUiENDrIWPkmSVIiZrCSpVg4XS5JUikFWkqQympzJOicrSVIh\nZrKSpFo1OZM1yEqSamWQlSSplIy6e1CMQVaSVKsmZ7IWPkmSVIiZrCSpVtnucLEkSUU0ebjYICtJ\nqlVa+CRJUhlNzmQtfJIkqRCDrCSpVtkePdo6IyK+GREPRcT4iPhW1bZERNwcEY9Xj4tX7RER/xsR\nEyLiwYj4WHe/m0FWklSrzJ5tcxIRqwNfB9YG1gC2jIhVgMOBWzJzVeCW6jnAZsCq1TYCOLO7380g\nK0mqVS9ksh8CxmTma5nZBtwGbA9sA4yqjhkFbFvtbwNckC13A4tFxLLd+W4GWUlS0z0ErBcRS0bE\nAsDmwArA0MycXB3zHDC02l8eeKbD+ydVbV1mdbEkqVY9XYwiIkbQGtadbmRmjpzx+ZmPRMSJwG+B\nV4EHgGnv6ENmRkQnBp+7xiArSapVZ+ZVZ//+HAmMnMMx5wDnAETE8bSy0+cjYtnMnFwNB79QHf4s\nrUx3umFVW5c5XCxJqlUvVRe/t3r8L1rzsZcAo4Hdq0N2B66t9kcDX62qjNcBXu4wrNwlZrKSpFr1\n0opPV0fEksBUYL/MnBIRJwBXRMRewNPAl6pjf0Nr3nYC8BqwZ3dPapCVJDVeZq43k7Z/AhvPpD2B\n/ebGeQ2ykqRaNXlZRYOsJKlW7d4gQJKkMvrlXXgi4jpgloXVmbl1kR5JkvqV/nrT9pN7rReSJDXQ\nLINsZt42fT8ihgD/lZmP9UqvJEn9Rk8Xo+jL5rgYRURsRWsJqhur52tGxOjSHZMk9Q+9sRhFXTpT\n+HQMrdsD/QEgMx+IiOEF+yRJ6keaXF3cmWUVp2bmy+9qa3ByL0nS3NGZTHZ8ROwCDIyIVYEDgTvL\ndkuS1F80+RKezmSyBwCrAW8ClwKvAN8q2SlJUv+R2bOtL5tjJpuZrwFHVJskSXNVk+dk5xhkI+JW\nZjIHm5kbFemRJKlfafJwcWfmZA/usD8/sAPQVqY7kiQ1R2eGi+97V9OfIuKeQv2RJPUzfX1etSc6\nM1y8RIenA4CPA4sW61FlweXXL30Kqbi1llq57i5IfV6/npMF7qM1Jxu0homfBPYq2SlJUv/R3+dk\nP5SZb3RsiIjBhfojSepnmpzJduY62ZktPHHX3O6IJElNM7v7yS4DLA8MiYi1aA0XAywCLNALfZMk\n9QMNrnua7XDxF4A9gGHAKbwdZF8B/rtstyRJ/UWTh4tndz/ZUcCoiNghM6/uxT5JkvqRJhc+dWZO\n9uMRsdj0JxGxeEQcV7BPkiQ1QmeC7GaZOWX6k8x8Cdi8XJckSf1Jew+3vqwzl/AMjIjBmfkmQEQM\nAbyER5I0VyTNHS7uTJC9GLglIs6jVfy0BzCqZKckSf1He4PLizuzdvGJEfEX4HO0Kq1vAlYs3TFJ\nUv/Q3uBMtjNzsgDP0wqwXwQ2Ah4p1iNJkhpidotRvB/4crW9CFwORGZu2Et9kyT1A/11TvZR4A5g\ny8ycABAR3+6VXkmS+o2+XiHcE7MbLt4emAzcGhFnRcTG
0OA/NyRJtUiiR1tfNssgm5m/ysydgQ8C\ntwLfAt4bEWdGxOd7q4OSJM2r5lj4lJmvZuYlmbkVrXWM7wcOK94zSVK/0N8Xo5ihWu1pZLVJktRj\nfT1Q9kSXgqwkSXNbX59X7QmDrCSpVu3NjbGdXoxCkiR1kZmsJKlWTV5W0SArSapVg+8PYJCVJNXL\n6mJJkgppj+YOF1v4JElSIWaykqRaOScrSVIhzslKklSIi1FIkqQuM5OVJNXKxSgkSSrEwidJkgpp\n8pysQVaSVKsmVxdb+CRJUiFmspKkWjknK0lSIU2ek3W4WJJUq/Yebp0REYtFxFUR8WhEPBIR60bE\nEhFxc0Q8Xj0uXh0bEfG/ETEhIh6MiI9197sZZCVJteqNIAucBtyYmR8E1gAeAQ4HbsnMVYFbqucA\nmwGrVtsI4MzufjeDrCSp0SJiUWB94ByAzHwrM6cA2wCjqsNGAdtW+9sAF2TL3cBiEbFsd85tkJUk\n1SqjZ1tEjIiIeztsI951iuHAP4DzIuL+iDg7IhYEhmbm5OqY54Ch1f7ywDMd3j+pausyC58kSbXq\n6XWymTkSGDmbQwYBHwMOyMwxEXEabw8NT/+MjIi5XuhsJitJqlUvzMlOAiZl5pjq+VW0gu7z04eB\nq8cXqtefBVbo8P5hVVuXGWQlSY2Wmc8Bz0TEB6qmjYGHgdHA7lXb7sC11f5o4KtVlfE6wMsdhpW7\nxOFiSVKtemkxigOAiyNiPmAisCetRPOKiNgLeBr4UnXsb4DNgQnAa9Wx3WKQlSTVqjcWo8jMB4BP\nzOSljWdybAL7zY3zGmQlSbVq8g0CDLKSpFo1Ocha+CRJUiFmspKkWnkXHkmSCmnyXXgMspKkWjV5\nTtYgK0mqVZOHiy18kiSpEDNZSVKt2hucyxpkJUm1ck5WkqRCmpvHOicrSVIxZrKSpFo5XCxJUiEu\nRiFJUiFWF0uSVEhzQ6yFT5IkFWMmK0mqlYVPkiQV4pysJEmFNDfEGmQlSTVr8nCxhU+SJBViJitJ\nqpVzspIkFdLcEGuQlSTVzDlZSZLUZWaykqRaZYMHjA2ykqRaNXm42CArSaqV1cWSJBXS3BBr4ZMk\nScUYZPuJkb84mUnPPMD9f/7djLY1Pvph7rh9NGPvuYm77vw1n/jEmjX2UJq5ocu9lzOv/AmX/+EC\nLr91FDvvtSMABx61L1fefiGX/O48fnTOcSy0yELvfN/y7+W2x29k1312rqPb6oJ2skdbX2aQ7Scu\nuPBKttxq13e0Hf/DIzjuf07lk2t/gWO/fwo/PP6ImnonzVpb2zR+8v2fsdMGX2XPLfdhxz22Y/iq\nKzLm9nvZecM92OVze/K3iZPY44B3/r6/ffT+3Pn7MTX1Wl3R3sOtLzPI9hN//OMYXnppyjvaMpNF\nFm799b/oIgszefLzdXRNmq1/vvBPHhv3VwBee/V1nprwNEsvuzRjbhvLtGnTAHjovvEMXXbpGe/5\n7Kaf4e/PTGbiX5+qo8vqouzhf31Z0cKniBgIbAGs1PFcmfnjkudV5xx88DFcf93FnHDCUQwYMIDP\nbrBN3V2SZmvZYcvwgdVXZfyfH35H+9Zf3pybr/09AEMWGMJXv7EL++/8HXbd16HieUFfz0Z7onQm\nex2wB7AksHCHbaYiYkRE3BsR97ZPe7Vw1zRixFc55JBjWXmVtTnkkGP4xS9OrrtL0iwNWWAIJ579\nA378vZ/y6n9em9G+54G70dY2jRuuuRmAEQfvyaVnXcnrr71eV1elGUpfwjMsMz/a2YMzcyQwEmC+\nwcP69hhAA+y2644cdND3ALjq6uv5+c9PqrlH0swNHDSQE8/+ATdeczO33nD7jPYtv7Qpn/ncunxj\np2/PaFttrQ+x0Raf5YAj92HhRRaivT158823uPK8a+roujqhrw/59kTpIHtDRHw+M39b+DzqhsmT\nn2f99dfl9tvvYsMN
P82ECU/W3SVppo465TCeevxpLhl5xYy2dTdYm92+sQt7b38Ab77+5oz2Edsd\nMGP/69/Zk9dffd0A28c1ebi4dJC9G/hlRAwApgIBZGYuUvi8epcLLzid9ddfl6WWWoKJT4zl+z84\nhX32PZQfn3IsgwYN4o033mTfbxxWdzel/2eNtT/CFl/clMcffoKLbz4HgDN+eBYH/+BA5hs8H2dc\n3irxGHffw5xw+Cl1dlXd1J7NzWQjC365iHgS2AYYl108kcPFaoI1lnxf3V2Q5oqxf789Sn32bitu\n36N/7y98+ppifeup0pnsM8BDXQ2wkqT+o8kBonSQnQj8ISJuAGZMmngJjyRpur6+alNPlA6yT1bb\nfNUmSdI7WF3cTZl5bMnPlyTN+6wu7qaIWBo4FFgNmH96e2ZuVPK8kiT1BaVXfLoYeBQYDhwLPAWM\nLXxOSdI8xLvwdN+SmXkOMDUzb8vMrwFmsZKkGbxBQPdNrR4nR8QWwN+BJQqfU5I0D3FOtvuOi4hF\nge8APwUWAb49+7dIkvqTJi+lULq6+Ppq92Vgw5LnkiSpryk6JxsR74uI6yLixYh4ISKujQjXmZMk\nzWDhU/ddAlwBLAMsB1wJXFr4nJKkeUh7D7e+rHSQXSAzL8zMtmq7iA7Xy0qSVLq6OCLmj4h7IuIv\nETE+Io6t2odHxJiImBARl0fEfFX74Or5hOr1lbr73UoH2Rsi4vCIWCkiVoyIQ4HfRMQSEWGVsSSp\nN7wJbJSZawBrAptGxDrAicCpmbkK8BKwV3X8XsBLVfup1XHdUrq6+EvV4968faOFAHaunjs/K0n9\nXOl51epOcP+pnr6n2pLWug27VO2jgGOAM2ndovWYqv0q4PSIiO7cUa50JnsYsEZmDgfOA/4C7JCZ\nwzPTACtJIjN7tHVGRAyMiAeAF4CbgSeAKZnZVh0yCVi+2l+e1q1aqV5/GViyO9+tdJA9MjNfiYjP\n0PqL4WxafyVIkgT0vPApIkZExL0dthHvPkdmTsvMNYFhwNrAB4t/McoPF0+rHrcAzsrMX0fEcYXP\nKUmah/R0acTMHAmM7OSxUyLiVmBdYLGIGFRlq8OAZ6vDngVWACZFxCBgUeCf3elb6Uz22Yj4BbAT\nrYKnwb1wTkmSZoiIpSNisWp/CLAJ8AhwK7BjddjuwLXV/ujqOdXrv+/OfCz0TuHTpsDJ1V8PywKH\nFD6nJGke0gsLSiwLjIqIgbQSvSsy8/qIeBi4rBphvR84pzr+HODCiJgA/ItWsW63lF5W8TXgmg7P\nJwOTS55TkjRvKb12cWY+CKw1k/aJtOZn393+BvDFuXHu0pmsJEmz1deXRuwJ50clSSrETFaSVKu+\nfuP1njDISpJq1e79ZCVJKqO5IdYgK0mqmYVPkiSpy8xkJUm1anIma5CVJNWq9GIUdTLISpJqZSYr\nSVIhTb5O1sInSZIKMZOVJNXKOVlJkgpxTlaSpEKanMk6JytJUiFmspKkWjlcLElSIU2+hMcgK0mq\nlbe6kySpkCZnshY+SZJUiJmsJKlWDhdLklRIk4eLDbKSpFqZyUqSVEiTM1kLnyRJKsRMVpJUK4eL\nJUkqpMnDxQZZSVKtMtvr7kIxzslKklSImawkqVbehUeSpEKafNN2g6wkqVZmspIkFdLkTNbCJ0mS\nCjGTlSTVysUoJEkqxMUoJEkqpMlzsgZZSVKtmlxdbOGTJEmFmMlKkmrlcLEkSYVYXSxJUiFNzmSd\nk5UkqRAzWUlSrZpcXWyQlSTVqsnDxQZZSVKtLHySJKmQJi+raOGTJEmFmMlKkmrlcLEkSYVY+CRJ\nUiFNnpM1yEqSatXkTNbCJ0mSCjGTlSTVqsmZrEFWklSr5oZYiCb/BaHZi4gRmTmy7n5IPeVvWX2V\nc7L924i6OyDNJf6W1ScZZCVJKsQgK0lSIQbZ/s05LDWFv2X1SRY+SZJUiJmsJEmFGG
QbKCJWioiH\n6u6HJPV3BllJkgoxyDbXwIg4KyLGR8RvI2JIRHw9IsZGxF8i4uqIWAAgIs6PiDMj4u6ImBgRG0TE\nuRHxSEScX/P3UD8TEQtGxK+r3+lDEbFTRDwVET+KiHERcU9ErFIdu1VEjImI+yPidxExtGo/JiJG\nRcQdEfF0RGzf4f03RsR76v2W6i8Mss21KnBGZq4GTAF2AK7JzE9m5hrAI8BeHY5fHFgX+DYwGjgV\nWA34SESs2as9V3+3KfD3zFwjM1cHbqzaX87MjwCnAz+p2v4IrJOZawGXAYd2+JyVgY2ArYGLgFur\n978ObFH+a0gG2SZ7MjMfqPbvA1YCVq/+sh8HfIVWEJ3uumyVmo8Dns/McZnZDoyv3iv1lnHAJhFx\nYkSsl5kvV+2Xdnhct9ofBtxU/aYP4Z2/6Rsyc2r1eQN5O1iPw9+0eolBtrne7LA/jdbNIM4H9q/+\nmj8WmH8mx7e/673teCMJ9aLM/CvwMVrB8LiI+N70lzoeVj3+FDi9+k3vzUx+09Ufi1Pz7esV/U2r\n1xhk+5eFgcnVfNRX6u6MNDMRsRzwWmZeBJxEK+AC7NTh8a5qf1Hg2Wp/917rpNRJ/jXXvxwFjAH+\nUT0uXG93pJn6CHBSRLQDU4F9gauAxSPiQVoZ6perY48BroyIl4DfA8N7v7vSrLnik6Q+LyKeAj6R\nmS/W3RepKxwuliSpEDNZSZIKMZOVJKkQg6wkSYUYZKVCImK1iNi67n5Iqo9BVuqkiJgWEQ9U6+le\nOX3t51kc+1/AEcAfZvH6BhFxfbW/dUQcXu1vGxEfLtB9STUwyEqd93pmrlmtp/sWsE/HF6NlAEBm\n/i0zd8nMV+b0oZk5OjNPqJ5uCxhkpYYwyErdcwewSnXv3sci4gLgIWCFiPh8RNwVEX+uMt6FACJi\n04h4NCL+DGw//YMiYo+IOD0iPkVrMfuTqox55Tq+mKS5xyArdVFEDAI2o7W2LrTuePSz6o5HrwJH\nAp/LzI8B9wIHRcT8wFnAVsDHgWXe/bmZeSetOyAdUmXMTxT/MpKKcllFqfOGRMT0OxvdAZwDLAc8\nnZl3V+3r0Bru/VNEAMxHa53dD9K6M9LjABFxETCiF/suqQYGWanzXs/Md9xbtwqkr3ZsAm7OzC+/\n6zjvySv1Qw4XS3PX3cCnI2IVgIhYMCLeDzwKrNRhnvXLs3j/v/HGDVJjGGSluSgz/wHsAVxa3THm\nLuCDmfkGreHhX1eFTy/M4iMuAw6JiPstfJLmfa5dLElSIWaykiQVYpCVJKkQg6wkSYUYZCVJKsQg\nK0lSIQZZSZIKMchKklSIQVaSpEL+DxIeuChB+c36AAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HGE0AjFI5KJ2", - "colab_type": "text" - }, - "source": [ - "On remarque une légère amélioration : le SVM est donc un bon choix pour ce jeu de données." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8iz2DK4OpaFh", - "colab_type": "code", - "outputId": "037567ac-bdfd-46f7-a4d0-2d945a09b883", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 615 - } - }, - "source": [ - "from IPython.display import display\n", - "\n", - "for predicted in category_id_df.category_id:\n", - " for actual in category_id_df.category_id:\n", - " if predicted != actual and conf_mat[actual, predicted] >= 6:\n", - " print(\"'{}' est prédit comme '{}' : {} exemples.\".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))\n", - " display(df_stem.loc[indices_test[(y_test == actual) & (y_pred == predicted)]][['Category', 'Message_stemmed_without_StopWords']])\n", - " print('')" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "'spam' est prédit comme 'ham' : 18 exemples.\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMessage_stemmed_without_StopWords
684spamhi i 'm sue . i 20 year old work lapdanc . i love sex . text live - i 'm bedroom . text sue 89555 . by textoper g2 1da 150ppmsg 18+
731spamemail alertfrom : jeri stewarts : 2kbsubject : low-cost prescripiton drvgsto listen email call 123
1940spammore peopl dog area . call 09090204448 join like mind guy . whi arrang 1 . there 's 1 even . a£1.50 minapn ls278bb
751spamdo realiz 40 year , ll thousand old ladi run around tattoo ?
4213spammiss call alert . these number call left messag . 07008009200
4298spamthesmszone.com let send free anonym mask messages..im send messag there..do see potenti abus ? ? ?
5466spamhttp//tms . widelive.com/index . wml ? id=820554ad0a1705572711 & first=true¡c c ringtone¡
333spamcall germani 1 penc per minut ! call fix line via access number 0844 861 85 85 . no prepay . direct access !
3864spamoh god ! i ve found number ! i 'm glad , text back xafter msgs cst std ntwk chg £1.50
5449spamlatest news ! polic station toilet stolen , cop noth go !
1430spamfor sale - arsenal dartboard . good condit doubl trebl !
2247spamhi ya babe x u 4goten bout ? ' scammer get smart..though regular vodafon , respond get prem rate msg/subscript . other nos use also . bewar !
3991spam( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per..
3460spamnot heard u4 . call night knicker . make beg like u last time 01223585236 xx luv nikiyu4.net
2823spamromcapspam everyon around respond well presenc sinc warm outgo . you bring real breath sunshin .
2402spambabe : u want dont u babi ! im nasti thing 4 filthyguy . fanci rude time sexi bitch . how go slo n hard ! txt xxx slo ( 4msgs )
3501spamdorothi @ kiefer.com ( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per..
4821spamcheck out choos your babe video @ sms.shsex.netun fgkslpopw fgkslpo
\n", - "
" - ], - "text/plain": [ - " Category Message_stemmed_without_StopWords\n", - "684 spam hi i 'm sue . i 20 year old work lapdanc . i love sex . text live - i 'm bedroom . text sue 89555 . by textoper g2 1da 150ppmsg 18+ \n", - "731 spam email alertfrom : jeri stewarts : 2kbsubject : low-cost prescripiton drvgsto listen email call 123 \n", - "1940 spam more peopl dog area . call 09090204448 join like mind guy . whi arrang 1 . there 's 1 even . a£1.50 minapn ls278bb \n", - "751 spam do realiz 40 year , ll thousand old ladi run around tattoo ? \n", - "4213 spam miss call alert . these number call left messag . 07008009200 \n", - "4298 spam thesmszone.com let send free anonym mask messages..im send messag there..do see potenti abus ? ? ? \n", - "5466 spam http//tms . widelive.com/index . wml ? id=820554ad0a1705572711 & first=true¡c c ringtone¡ \n", - "333 spam call germani 1 penc per minut ! call fix line via access number 0844 861 85 85 . no prepay . direct access ! \n", - "3864 spam oh god ! i ve found number ! i 'm glad , text back xafter msgs cst std ntwk chg £1.50 \n", - "5449 spam latest news ! polic station toilet stolen , cop noth go ! \n", - "1430 spam for sale - arsenal dartboard . good condit doubl trebl ! \n", - "2247 spam hi ya babe x u 4goten bout ? ' scammer get smart..though regular vodafon , respond get prem rate msg/subscript . other nos use also . bewar ! \n", - "3991 spam ( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per.. \n", - "3460 spam not heard u4 . call night knicker . make beg like u last time 01223585236 xx luv nikiyu4.net \n", - "2823 spam romcapspam everyon around respond well presenc sinc warm outgo . you bring real breath sunshin . \n", - "2402 spam babe : u want dont u babi ! im nasti thing 4 filthyguy . fanci rude time sexi bitch . how go slo n hard ! 
txt xxx slo ( 4msgs ) \n", - "3501 spam dorothi @ kiefer.com ( bank granit issu strong-buy ) explos pick for our member *****up over 300 % *********** nasdaq symbol cdgt that $ 5.00 per..\n", - "4821 spam check out choos your babe video @ sms.shsex.netun fgkslpopw fgkslpo " - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_J_FXRewpaFk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6EnkB1U75g_q" - }, - "source": [ - "\n", - "## 6.2. Le cas du Lemmatization" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "qaFeUq4x5g_v" - }, - "source": [ - "Création de dataframe spécifique au message Lemmatizé" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "eo0mCQlF5g_y", - "colab": {} - }, - "source": [ - "col = ['Category', 'Message_lemmatized_before_stemming','category_id']\n", - "df_lem = spam[col]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "20be06a6-f249-4a26-8697-a51975167dc4", - "id": "O1FDhanu5g_1", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 212 - } - }, - "source": [ - "df_lem.head()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMessage_lemmatized_before_stemmingcategory_id
0hamGo jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ...0
1hamOk lar ... Joking wif u oni ...0
2spamFree entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's1
3hamU dun say early hor ... U c already say ...0
4hamNah I n't think go usf , life around though0
\n", - "
" - ], - "text/plain": [ - " Category ... category_id\n", - "0 ham ... 0 \n", - "1 ham ... 0 \n", - "2 spam ... 1 \n", - "3 ham ... 0 \n", - "4 ham ... 0 \n", - "\n", - "[5 rows x 3 columns]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 34 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ptWn7yRg5g_5" - }, - "source": [ - "Maintenant, chacun des 5572 messages lemmatisés est représenté par 2566 caractéristiques (features), correspondant au score tf-idf de différents unigrammes et bigrammes" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "46a61288-8350-44ae-a200-99c5255385c0", - "id": "VMj1vFpJ5g_6", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "corpus_stem = df_lem.Message_lemmatized_before_stemming\n", - "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))\n", - "featuresLem = vectorizer.fit_transform(corpus_stem).toarray()\n", - "labels = df_lem.category_id\n", - "featuresLem.shape" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(5572, 2566)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 36 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "m3AcMEM55g__" - }, - "source": [ - "#### 6.2.1. 
Classification à l'aide du Naive bayes" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "GDVyKOtE5hAA", - "colab": {} - }, - "source": [ - "from sklearn.model_selection import train_test_split\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.feature_extraction.text import TfidfTransformer\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(df_lem['Message_lemmatized_before_stemming'], df_lem['Category'], random_state = 0)\n", - "count_vect = CountVectorizer()\n", - "X_train_counts = count_vect.fit_transform(X_train)\n", - "tfidf_transformer = TfidfTransformer()\n", - "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)\n", - "\n", - "clf = MultinomialNB().fit(X_train_tfidf, y_train)\n", - "# The classifier was trained on tf-idf features, so the test counts must go through the same fitted tf-idf transform\n", - "y_pred = clf.predict(tfidf_transformer.transform(count_vect.transform(X_test)))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ubAOz_Fn71H7", - "colab_type": "text" - }, - "source": [ - "#####6.2.1.1. 
F1-Score" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "d600749a-cca9-4fe6-a007-127243502c1d", - "id": "Farl3n865hAC", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 175 - } - }, - "source": [ - "from sklearn import metrics\n", - "print(metrics.classification_report(y_test, y_pred, target_names=df_lem['Category'].unique()))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " ham 0.98 1.00 0.99 1208\n", - " spam 1.00 0.88 0.94 185\n", - "\n", - " accuracy 0.98 1393\n", - " macro avg 0.99 0.94 0.96 1393\n", - "weighted avg 0.98 0.98 0.98 1393\n", - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "97338842-307d-4c9f-c05c-463e8fddab23", - "id": "Q0P9HCGM5hAF", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(clf.predict(count_vect.transform([\"Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . 
Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's\"])))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "['spam']\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "0cc5c06a-b266-4fa0-b097-8dcdd330b4b3", - "id": "ptPFEuio5hAI", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "print(clf.predict(count_vect.transform([\"Nah I n't think go usf , life around though\"])))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "['ham']\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "HzNSltF75hAL" - }, - "source": [ - "On remarque que 22 messages ont été mal classés (22 spam prédits comme ham, 0 ham prédit comme spam)" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "2ce176b5-dcbb-4bb8-f586-eb597c56d629", - "id": "xBaEUblc5hAN", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 388 - } - }, - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "conf_mat = confusion_matrix(y_test, y_pred)\n", - "fig, ax = plt.subplots(figsize=(8,6))\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", - " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", - "plt.ylabel('Actuel')\n", - "plt.xlabel('Prédit')\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAeOklEQVR4nO3deZRdVZX48e9OhQREhIBCIImEyR+C\nQBBERF2CKBiVQVGGVgRkGX+20iLtgEqj2Djb+GtaGw10NExhUBGQSUBEWpkCBMKokUEIYVIBZchQ\ntX9/vJvwiBmq8nLqVu77flh31Xun7nvnPNZbtbPP2ffcyEwkSdLKN6zuAUiS1FQGWUmSCjHISpJU\niEFWkqRCDLKSJBVikJUkqZDhdQ9gaeY/ca/XFmmVt8ZGb657CNJKsWDe7Cj13p3+vV/t5ZsWG1un\nhmyQlSR1ib7eukdQjNPFkiQVYiYrSapX9tU9gmIMspKkevUZZCVJKiIbnMm6JitJUiFmspKkejld\nLElSIQ2eLjbISpLq1eDrZA2ykqR6NTiTtfBJkqRCzGQlSfWy8EmSpDKafJ2sQVaSVC8zWUmSCmlw\nJmvhkyRJhZjJSpLq5XWykiQV0uDpYoOsJKleDS58ck1WkqRCzGQlSfVyuliSpEIaPF1skJUk1SrT\n6mJJkspo8HSxhU+SJBViJitJqpdrspIkFdLg6WKDrCSpXm6rKElSIQ3OZC18kiSpEDNZSVK9LHyS\nJKmQBk8XG2QlSfVqcCbrmqwkSYWYyUqS6tXgTNYgK0mqlTcIkCSpFDNZSZIKaXB1sYVPkiQVYiYr\nSaqX08WSJBXS4Olig6wkqV5mspIkFdLgTNbCJ0mSCjGTlSTVy+liSZIKMchKklSIa7KSJGmgzGQl\nSfVyuliSpEIaPF1skJUk1avBmaxrspKkemVfZ8dyRMSUiHgsIm5va1s3Ii6PiD9UP0dV7RERJ0bE\nrIi4LSJe2/aaQ6rz/xARh/TnoxlkJUlN92PgHYu1HQ1cmZlbAFdWzwEmAltUxyTgJGgFZeBLwOuB\nnYAvLQzMy2KQlSTVq6+vs2M5MvM3wF8Wa94HmFo9ngrs29Z+arZcB6wTERsCewKXZ+ZfMvOvwOX8\nY+D+B67JSpLqVc+a7AaZOad6/AiwQfV4DPBg23kPVW1La18mM1lJUr0yOzoiYlJETG87Jg2s+0wg\nS3w0M1lJUr06zGQzczIweYAvezQiNszMOdV08GNV+2xgXNt5Y6u22cCui7X/enmdmMlKkrrRBcDC\nCuFDgPPb2j9UVRnvDDxVTStfBuwREaOqgqc9qrZlMpOVJNWr8JpsREyjlYW+PCIeolUl/A3gnIg4\nHHgA2L86/WLgncAs4FngMIDM/EtE/DtwY3XeVzJz8WKqf2CQlSTVq/COT5l50FJ+tfsSzk3g40t5\nnynAlIH0bZCVJNXLHZ8kSdJAmclKkuqVRa6eGRIMspKkejV4utggK0mql0FWkqRCGnw/WQufJEkq\nxExWklSr7LPwSZKkMlyTlSSpkAavyRpkJUn1avB0sYVPkiQVYiYrSaqXa7KSJBVikJUkqZAG713s\nmqwkSYWYyTbMMV87gd/89gbWHbUOPz/9BwB853uncPVvr2f4asMZN2ZDjv/CUbxsrZcCcPKpZ/Oz\nX1xGz7BhfP5TH+ONr98BgFPPOo+fXngpEcEWm43n+C8cxciRI2r7XNLS7LnHrpxwwlfoGTaMKT+a\nxre+/f26h6SBavB0sZlsw+z7zrfzgxOOf1HbG163Peed9gPOO/Ukxo8bwymnnQ3AH+97gEuuvJrz\nT/8BPzjheP79O9+jt7eXRx9/gjN+cj5nTzmRn5/+A/r6+rjkiqvr+DjSMg0bNowT//OrvHuvD7LN\ndrtxwAH78upXb1H3sDRQfdnZMYQZZBtmxwnbsPbL1npR2xtfv
wPDh/cAsO3WW/LoY08A8KtrrmPi\n7m9hxIgRjN1oNK8cuxEz7/o9AAt6e5k7dx4LFvTy3PNzecXL1x3cDyL1w06v254//vF+7rvvT8yf\nP59zzjmfvffas+5haaCyr7NjCCs+XRwR2wLj2/vKzJ+V7ldLdt5Fv+Qdu78FgMce/zPbvmbLRb/b\nYP2X89jjTzDhNa/m0IP2423v/RCrjxzBLq977aJpZGko2WjMaB586OFFzx+aPYedXrd9jSPSChni\n2WgnimayETEFmALsB+xVHe9exvmTImJ6REw/5dRpJYfWlX44dRo9PT28e4/dlnneU0//jauuuY7L\nzv0Rvzr/DJ57fi4XXvarQRqlJDVH6Ux258zcqr8nZ+ZkYDLA/Cfube4/bWrw84su5ze/vYFTTvw6\nEQHA+q9Yj0cefXzROY8+9gTrv+LlXDd9BmM22oB1R60DwO5v2YUZM+9krz3fWsvYpaV5ePYjjBu7\n0aLnY8dsyMMPP1LjiLQi0sKnFXZtRPQ7yKqM/71uOlPOPJf/+uaXWGP11Re17/amnbnkyquZN28e\nDz38CH966GG2efWr2HCDV3Db7Xfz3PPPk5lcP30Gm248rsZPIC3ZjdNnsPnmmzB+/DhWW2019t9/\nHy78xS/rHpYGqsGFT6Uz2VNpBdpHgLlAAJmZ2xbut2t95kvf4MZbbuPJJ59m930/yD8ffjCnnHY2\n8+bP5yNHfhFoFT996bNHsPmmG7PnW9/M3h/4KMN7evjiUf9MT08P2269JW/f7U3sf9gR9PT0sOWr\nNuP9+0ys+ZNJ/6i3t5dPHnkMF190Jj3DhvHjqWdz552/r3tYGqghXrzUiciCO21ExCzgKGAmsOj/\nYmY+sLzXOl2sJlhjozfXPQRppVgwb3aUeu9njv9gR3/v1zzm9GJj61TpTPbxzLygcB+SpFXZEJ/y\n7UTpIHtLRJwJXEhruhjwEh5JUpsGFz6VDrJr0Aque7S1JWCQlSS1mMmumMw8rOT7S5IaoMGFT0WD\nbESsDhwObA0sunYkMz9csl9JkoaC0tfJngaMBvYErgbGAn8r3KckaVXidbIrbPPMfH9E7JOZU6si\nqGsK9ylJWoU0ecen0kF2fvXzyYh4DfAIsH7hPiVJq5Ihno12onSQnRwRo4BjgAuAlwL/VrhPSdKq\nxCC7wk6jdQee8cDUqm2Dwn1KkjQklA6y5wNPATfRthmFJEmLeAnPChubme8o3IckaVXmdPEK+11E\nbJOZMwv3I0laRaVBdmAiYiat7ROHA4dFxL14qztJUpcplcm+u9D7SpKaxkx2YPpzv1hJkgDvwiNJ\nUjFmspIkFdLgIFv6BgGSJHUtM1lJUq0ym5vJGmQlSfVq8HSxQVaSVC+DrCRJZTR5xycLnyRJKsRM\nVpJUrwZnsgZZSVK9mrvhk0FWklQv12QlSdKAmclKkurV4EzWICtJqpdrspIkldHkNVmDrCSpXg3O\nZC18kiSpEDNZSVKtmjxdbCYrSapXX4fHckTE/4mIGW3H0xFxZER8OSJmt7W/s+01n4+IWRFxT0Ts\nuaIfzUxWklSrLLwmm5n3ABMAIqIHmA2cBxwGfDczv9N+fkRsBRwIbA1sBFwREa/KzN6B9m0mK0mq\nV+FMdjG7A3/MzAeWcc4+wFmZOTcz7wNmATsNuCcMspKkVVxETIqI6W3HpGWcfiAwre35JyLitoiY\nEhGjqrYxwINt5zxUtQ2YQVaSVKvs6/DInJyZO7Ydk5fUT0SMAPYGzq2aTgI2ozWVPAf4j5X92VyT\nlSTVa/Cuk50I3JyZjwIs/AkQEScDv6iezgbGtb1ubNU2YGaykqRadZrJDsBBtE0VR8SGbb97D3B7\n9fgC4MCIGBkRmwBbADesyGczk5UkNV5ErAm8HfhoW/O3ImICkMD9C3+XmXdExDnAncAC4OMrUlkM\nBllJUs1KX8IDkJnPAOst1
nbwMs7/KvDVTvs1yEqSajUYQbYuBllJUr0y6h5BMQZZSVKtmpzJWl0s\nSVIhZrKSpFpln9PFkiQV0eTpYoOsJKlWaeGTJEllNDmTtfBJkqRCzGQlSbWy8EmSpEIy6x5BOQZZ\nSVKtmpzJuiYrSVIhZrKSpFo1OZM1yEqSauWarCRJhZjJSpJUSJN3fLLwSZKkQsxkJUm1avK2igZZ\nSVKt+ho8XWyQlSTVqslrsksNshFxIbDUwurM3LvIiCRJXaVbq4u/M2ijkCSpgZYaZDPz6oWPI2IN\n4JWZec+gjEqS1DWavBnFci/hiYi9gBnApdXzCRFxQemBSZK6Q/ZFR8dQ1p/Cpy8DOwG/BsjMGRGx\nScExSZK6SJOri/uzGcX8zHxqsbYGJ/eSJK0c/clk74iIfwJ6ImIL4F+A35UdliSpWzT5Ep7+ZLJH\nAFsDc4FpwNPAkSUHJUnqHpmdHUPZcjPZzHwW+GJ1SJK0UjV5TXa5QTYirmIJa7CZ+dYiI5IkdZUm\nTxf3Z032022PVwf2AxaUGY4kSc3Rn+nimxZr+m1E3FBoPJKkLjPU11U70Z/p4nXbng4DdgDWLjai\nylpjdy3dhVTchPU2rXsI0pDX1WuywE201mSD1jTxfcDhJQclSeoe3b4m++rMfL69ISJGFhqPJKnL\nNDmT7c91skvaeOLalT0QSZKaZln3kx0NjAHWiIjtaU0XA7wMeMkgjE2S1AUaXPe0zOniPYFDgbHA\nf/BCkH0a+ELZYUmSukWTp4uXdT/ZqcDUiNgvM386iGOSJHWRJhc+9WdNdoeIWGfhk4gYFRHHFxyT\nJEmN0J8gOzEzn1z4JDP/Cryz3JAkSd2kr8NjKOvPJTw9ETEyM+cCRMQagJfwSJJWiqS508X9CbJn\nAFdGxI9oFT8dCkwtOShJUvfoa3B5cX/2Lv5mRNwKvI1WpfVlwMalByZJ6g59Dc5k+7MmC/AorQD7\nfuCtwF3FRiRJUkMsazOKVwEHVccTwNlAZOZugzQ2SVIX6NY12buBa4B3Z+YsgIj41KCMSpLUNYZ6\nhXAnljVd/F5gDnBVRJwcEbtDg/+5IUmqRRIdHUPZUoNsZv48Mw8EtgSuAo4E1o+IkyJij8EaoCRJ\nq6rlFj5l5jOZeWZm7kVrH+NbgM8VH5kkqSt0+2YUi1S7PU2uDkmSOjbUA2UnBhRkJUla2Yb6umon\nDLKSpFr1NTfG9nszCkmSNEBmspKkWjV5W0WDrCSpVg2+P4BBVpJUryZXF7smK0mqVV9ER0d/RMT9\nETEzImZExPSqbd2IuDwi/lD9HFW1R0ScGBGzIuK2iHjtin42g6wkqVvslpkTMnPH6vnRwJWZuQVw\nZfUcYCKwRXVMAk5a0Q4NspKkWmWHRwf2AaZWj6cC+7a1n5ot1wHrRMSGK9KBQVaSVKtB2lYxgV9G\nxE0RMalq2yAz51SPHwE2qB6PAR5se+1DVduAWfgkSapVp5tRVEFzUlvT5MxcfPvfN2Xm7IhYH7g8\nIu5u/2VmZkSs9EJng6wkaZVWBdRl7qmfmbOrn49FxHnATsCjEbFhZs6ppoMfq06fDYxre/nYqm3A\nnC6WJNWqj+joWJ6IWDMi1lr4GNgDuB24ADikOu0Q4Pzq8QXAh6oq452Bp9qmlQfETFaSVKtB2Ixi\nA+C8aF3uMxw4MzMvjYgbgXMi4nDgAWD/6vyLgXcCs4BngcNWtGODrCSpVqVvEJCZ9wLbLaH9z8Du\nS2hP4OMro2+DrCSpVu74JEmSBsxMVpJUK28QIElSIU2+abtBVpJUqyavyRpkJUm1anKQtfBJkqRC\nzGQlSbVK12QlSSqjydPFBllJUq2aHGRdk5UkqRAzWUlSrdyMQpKkQtyMQpKkQpq8JmuQlSTVqslB\n1sInSZIKMZOVJNXKwidJkgqx8EmSpEKavCZrkJUk1arJ08UWPkmSVIiZrCSpVn0NzmUNspK
kWrkm\nK0lSIc3NY12TlSSpGDNZSVKtnC6WJKkQN6OQJKkQq4slSSqkuSHWwidJkooxk5Uk1crCJ0mSCnFN\nVpKkQpobYg2ykqSaNXm62MInSZIKMZOVJNXKNVlJkgppbog1yEqSauaarCRJGjAzWUlSrbLBE8YG\nWUlSrZo8XWyQlSTVyupiSZIKaW6ItfBJkqRiDLJdYuzYDbnssrO45ZYrufnmK/j4xz8MwNe+9gVu\nvfVX3HjjZZx99mTWXvtlNY9U+kfHnnA0v5x5AWdfNfVF7Qd8eD9+cs3pnP3rU/mXYz4GwNYTXs0Z\nl0/hjMuncOYVP2LXiW+uY8gagD6yo2Moi8yhOcDVV3/l0BzYKmr06PUZPXp9Zsy4nZe+dE2uvfYi\n3v/+jzB27Giuuup39Pb2cvzxnwfgmGO+XvNom+M1ozauewiNsP3O2/HsM8/xlRO/yAG7HQLADrts\nz4c/+SGOPPizzJ83n1HrrcNf//wkI9cYyYJ5C+jt7WW99ddj2pU/YuKE99Db21vzp1i1TZ9zTZR6\n74+Mf39Hf+9Pvv/cYmPrlJlsl3jkkceYMeN2AP7+92e4++5ZjBkzmiuuuGbRH58bbriZsWNH1zlM\naYluue5Wnv7r0y9qe98h+zL1e6czf958AP765ycBmPvc3EXf6ZEjRzBUEwm9IDv8bygrWvgUET3A\nu4Dx7X1l5gkl+9WybbzxWCZM2JobbrjlRe2HHHIAP/nJhTWNShqYV246jgmv345/PnoSc+fO4z+P\n+z533no3AFtvvxXHfvdoNhy7AccecbxZ7BDX5Et4SmeyFwKHAusBa7UdSxQRkyJiekRM7+39e+Gh\ndac113wJ06b9kE9/+jj+9rcX/h9/7nOfYMGCBUybdl6No5P6b/jwHtZe52Uc+q6PcuJX/puvTz5u\n0e/uuOVODtj1Q3xo4iQOO+KDjBg5osaRqpuVvoRnbGZu29+TM3MyMBlcky1h+PDhnHXWDznrrPM4\n//xLF7UffPD7mDhxdyZOPKjG0UkD8+icx/nVxVcDcMeMu8i+ZJ311uHJatoY4P4/PMCzzzzHZltu\nwl233lPXULUcQ33KtxOlM9lLImKPwn2on374w29z992zOPHEUxa1vf3tb+Gooz7G+953OM8993yN\no5MG5upLr2HHN74WaE0dD19tOE/++Uk2GrchPT09AIweuwHjN9+Yhx98pM6hajn6OjyGstKZ7HXA\neRExDJgPBJCZ6XUig2yXXV7HBz6wHzNn3sX1118CwLHHfosTTjiOkSNHcNFFZwBwww23cMQRX6hz\nqNI/+Op/f4kddtmeddZdm4tu+imTvzOF86ddxLHf/TxnXzWV+fMX8OVPfg2ACa/flkM+8QEWzF9A\nZvKNz5/AU395quZPoGXpa3BxWtFLeCLiPmAfYGYOsCOni9UEXsKjpih5Cc/BG7+3o7/3pz3wsyF7\nCU/pTPZB4PaBBlhJUvdocoAoHWTvBX4dEZcAcxc2egmPJGmhob5rUydKB9n7qmNEdUiS9CJNri4u\nGmQz87jlnyVJ6mZDvUK4E6V3fHoF8Flga2D1he2Z+daS/UqSNBSUvk72DOBuYBPgOOB+4MbCfUqS\nViFNvgtP6SC7Xmb+DzA/M6/OzA8DZrGSpEVK3yAgIsZFxFURcWdE3BERn6zavxwRsyNiRnW8s+01\nn4+IWRFxT0TsuaKfrXTh0/zq55yIeBfwMLBu4T4lSauQQViTXQD8a2beHBFrATdFxOXV776bmd9p\nPzkitgIOpLXUuRFwRUS8KjMHfKeJ0kH2+IhYG/hX4L+AlwGfKtynJGkVUnorhcycA8ypHv8tIu4C\nxizjJfsAZ2XmXOC+iJgF7ARcO9C+i04XZ+YvMvOpzLw9M3fLzB0y84KSfUqSukv7HdyqY9Iyzh0P\nbA9cXzV9IiJui4gpETGqahtDazOlhR5i2UF5qYoG2Yj
YNCIujIgnIuKxiDg/IjYt2ackadXSaeFT\nZk7OzB3bjslL6iciXgr8FDgyM58GTgI2AybQynT/Y2V/ttKFT2cC5wCjac1rnwtMK9ynJGkVMhh3\n4YmI1WgF2DMy82cAmfloZvZmZh9wMq0pYYDZwLi2l4+t2gasdJB9SWaelpkLquN02q6XlSRpEKqL\nA/gf4K72bX0jYsO2094D3F49vgA4MCJGRsQmwBbADSvy2UoXPl0SEUcDZ9HaA/oA4OKIWBcgM/9S\nuH9Jkt4IHAzMjIgZVdsXgIMiYgKt+HQ/8FGAzLwjIs4B7qRVmfzxFakshvJBdv/q50d54UYLQas0\nOgHXZyWpy5XeUCIz/5dW7Fncxct4zVeBr3bad+np4s8B22XmJsCPgFuB/TJzk8w0wEqSyMyOjqGs\ndJA9JjOfjog30drp6RRa1VySJAGDU/hUl9JBduEc9ruAkzPzIrzlnSSpTenCpzqVDrKzI+KHvFDw\nNHIQ+pQkaUgoHfD2By4D9szMJ2ntW/yZwn1KklYhTb4LT+mbtj8L/Kzt+aL9IyVJgvJ7F9ep9CU8\nkiQt01DPRjvh+qgkSYWYyUqSajXUK4Q7YZCVJNWqzzVZSZLKaG6INchKkmpm4ZMkSRowM1lJUq2a\nnMkaZCVJtXIzCkmSCjGTlSSpkCZfJ2vhkyRJhZjJSpJq5ZqsJEmFuCYrSVIhTc5kXZOVJKkQM1lJ\nUq2cLpYkqZAmX8JjkJUk1cpb3UmSVEiTM1kLnyRJKsRMVpJUK6eLJUkqpMnTxQZZSVKtzGQlSSqk\nyZmshU+SJBViJitJqpXTxZIkFdLk6WKDrCSpVpl9dQ+hGNdkJUkqxExWklQr78IjSVIhTb5pu0FW\nklQrM1lJkgppciZr4ZMkSYWYyUqSauVmFJIkFeJmFJIkFdLkNVmDrCSpVk2uLrbwSZKkQsxkJUm1\ncrpYkqRCrC6WJKmQJmeyrslKklSImawkqVZNri42yEqSatXk6WKDrCSpVhY+SZJUSJO3VbTwSZKk\nQsxkJUm1crpYkqRCLHySJKkQ12QlSSokMzs6+iMi3hER90TErIg4uvBHWsQgK0lqtIjoAb4PTAS2\nAg6KiK0Go2+niyVJtRqENdmdgFmZeS9ARJwF7APcWbpjM1lJUq2yw6MfxgAPtj1/qGorbshmss8/\n/6eoewxNFxGTMnNy3eOQOuV3edW2YN7sjv7eR8QkYFJb0+Sh8n0wk+1uk5Z/irRK8LvcxTJzcmbu\n2HYsHmBnA+Pano+t2oozyEqSmu5GYIuI2CQiRgAHAhcMRsdDdrpYkqSVITMXRMQngMuAHmBKZt4x\nGH0bZLvbkFizkFYCv8tapsy8GLh4sPuNJm9nJUlSnVyTlSSpEINsA0XE+Ii4ve5xSFK3M8hKklSI\nQba5eiLi5Ii4IyJ+GRFrRMRHIuLGiLg1In4aES8BiIgfR8RJEXFdRNwbEbtGxJSIuCsiflzz51CX\niYg1I+Ki6nt6e0QcEBH3R8S3ImJmRNwQEZtX5+4VEddHxC0RcUVEbFC1fzkipkbENRHxQES8t+31\nl0bEavV+SnULg2xzbQF8PzO3Bp4E9gN+lpmvy8ztgLuAw9vOHwW8AfgUrevHvgtsDWwTERMGdeTq\ndu8AHs7M7TLzNcClVftTmbkN8D3g/1Vt/wvsnJnbA2cBn217n82AtwJ7A6cDV1Wvfw54V/mPIRlk\nm+y+zJxRPb4JGA+8pvqX/UzgA7SC6EIXZqvUfCbwaGbOzMw+4I7qtdJgmQm8PSK+GRFvzsynqvZp\nbT/fUD0eC1xWfac/w4u/05dk5vzq/Xp4IVjPxO+0BolBtrnmtj3upXVN9I+BT1T/mj8OWH0J5/ct\n9to+vJ5agygzfw+8llYwPD4ijl34q/bTqp//BXyv+k5/lCV8p6t/LM7PF65X9DutQWOQ7S5rAXOq\n9agP1D0YaUkiYiP
g2cw8Hfg2rYALcEDbz2urx2vzwh60hwzaIKV+8l9z3eXfgOuBx6ufa9U7HGmJ\ntgG+HRF9wHzgY8BPgFERcRutDPWg6twvA+dGxF+BXwGbDP5wpaVzxydJQ15E3A/smJlP1D0WaSCc\nLpYkqRAzWUmSCjGTlSSpEIOsJEmFGGSlQiJi64jYu+5xSKqPQVbqp4jojYgZ1X665y7c+3kp574S\n+CLw66X8fteI+EX1eO+IOLp6vG9EbFVg+JJqYJCV+u+5zJxQ7ac7D/i/7b+MlmEAmfmnzPynzHx6\neW+amRdk5jeqp/sCBlmpIQyy0oq5Bti8unfvPRFxKnA7MC4i9oiIayPi5irjfSlARLwjIu6OiJuB\n9y58o4g4NCK+FxG70NrM/ttVxrxZHR9M0spjkJUGKCKGAxNp7a0LrTse/Xd1x6NngGOAt2Xma4Hp\nwFERsTpwMrAXsAMwevH3zczf0boD0meqjPmPxT+MpKLcVlHqvzUiYuGdja4B/gfYCHggM6+r2nem\nNd3724gAGEFrn90tad0Z6Q8AEXE6MGkQxy6pBgZZqf+ey8wX3Vu3CqTPtDcBl2fmQYud5z15pS7k\ndLG0cl0HvDEiNgeIiDUj4lXA3cD4tnXWg5by+r/hjRukxjDISitRZj4OHApMq+4Ycy2wZWY+T2t6\n+KKq8OmxpbzFWcBnIuIWC5+kVZ97F0uSVIiZrCRJhRhkJUkqxCArSVIhBllJkgoxyEqSVIhBVpKk\nQgyykiQVYpCVJKmQ/w92vDj/dBxzaAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ttDjPV7E5hAQ" - }, - "source": [ - "#### 6.2.2. Classification par Linear SVC (svm)" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "YfmW2GnL5hAR", - "colab": {} - }, - "source": [ - "from sklearn.model_selection import train_test_split\n", - "from sklearn.svm import LinearSVC\n", - "\n", - "model = LinearSVC()\n", - "\n", - "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(featuresLem, labels, df_lem.index, test_size=0.33, random_state=0)\n", - "model.fit(X_train, y_train)\n", - "y_pred = model.predict(X_test)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "yg8lmpWk5hAT" - }, - "source": [ - "On remarque que le f1-score du LinearSVC est amélioré par rapport au Naive Bayes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tX3QDxsG7_1p", - "colab_type": "text" - }, - "source": [ - "#####6.2.2.1 f1-score" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "26886a4a-ce28-4269-d323-fe0f7e03844a", - "id": "1bI6ucAa5hAW", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 175 - } - }, - "source": [ - "from sklearn import metrics\n", - "print(metrics.classification_report(y_test, y_pred, target_names=df_stem['Category'].unique()))" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " ham 0.99 1.00 0.99 1597\n", - " spam 1.00 0.93 0.96 242\n", - "\n", - " accuracy 0.99 1839\n", - " macro avg 0.99 0.96 0.98 1839\n", - "weighted avg 0.99 0.99 0.99 1839\n", - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "1cdc9cd7-8fe7-4967-8653-ca5ac3dd0470", - "id": "_YHFm-MZ5hAY", 
- "colab": { - "base_uri": "https://localhost:8080/", - "height": 388 - } - }, - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "conf_mat = confusion_matrix(y_test, y_pred)\n", - "fig, ax = plt.subplots(figsize=(8,6))\n", - "sns.heatmap(conf_mat, annot=True, fmt='d',\n", - " xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)\n", - "plt.ylabel('Actuel')\n", - "plt.xlabel('Prédit')\n", - "plt.show()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdkAAAFzCAYAAABywHOKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAeIElEQVR4nO3de7hd07n48e+bKEIrEdQlcUmJalGX\ntkpbqi51v5QqitI6jRZ1yqlWb6faOj38Slun1BHXBBUpekRbWkfV5ShCqbiLoJIitETrmmS/vz/W\nTGypJHvvlbHnztzfj2c+a66x5lpzLM968u53jHeOGZmJJEla9AbU3QFJkprKICtJUiEGWUmSCjHI\nSpJUiEFWkqRCDLKSJBWyRN0dmJ+Zz07x2iIt9gattmXdXZAWiVmvTYtSn93uv/dvWfEdxfrWrj4b\nZCVJ/UTH7Lp7UIzDxZIkFWImK0mqV3bU3YNiDLKSpHp1GGQlSSoiG5zJOicrSVIhZrKSpHo5XCxJ\nUiENHi42yEqS6tXg62QNspKkejU4k7XwSZKkQsxkJUn1svBJkqQymnydrEFWklQvM1lJkgppcCZr\n4ZMkqdEi4tyImB4R93RqOz4ipkXEXdW2c6fXvhYRkyPiwYjYoVP7jlXb5Ig4rivnNpOVJNWr/HWy\n5wOnAWPnaf9RZp7cuSEi3g3sB6wPrAb8b0SsW718OrA9MBWYGBETMvO+BZ3YICtJqlfh4eLMvCEi\n1uri4XsA4zLzVeDRiJgMbFa9NjkzpwBExLjq2AUGWYeLJUn16uhoa4uIURFxe6dtVBfPfGRE3F0N\nJy9ftQ0Dnuh0zNSqbX7tC2SQlSQt1jJzdGa+r9M2ugtvOwNYG9gYeBI4pUTfHC6WJNWrhurizHx6\nzn5EnAX8sno6DVi906HDqzYW0D5fZrKSpHq1OVzcExGxaqenHwfmVB5PAPaLiKUiYgQwErgNmAiM\njIgREbEkreKoCQs7j5msJKlWmWWriyPiYmBrYMWImAp8G9g6IjYGEngMOKzVl7w3IsbTKmiaBRyR\nVQcj4kjgN8BA4NzMvHeh587MRf6FFoWZz07pmx2TumHQalvW3QVpkZj12rQo9dmv3PXLtv69X3rj\nXYv1rV0OF0uSVIjDxZKkerl2sSRJhTR47WKDrCSpXuWXVayNQVaSVK8GZ7IWPkmSVIiZrCSpXhY+\nSZJUSIOHiw2ykqR6NTiTdU5WkqRCzGQlSfVqcCZrkJUk1ar0DQLqZJCVJNXLTFaSpEIaXF1s4ZMk\nSYWYyUqS6uVwsSRJhTR4uNggK0mql5m
sJEmFNDiTtfBJkqRCzGQlSfVyuFiSpEIMspIkFeKcrCRJ\n6i4zWUlSvRwuliSpkAYPFxtkJUn1MpOVJKmQBmeyFj5JklSImawkqV4OF0uSVIhBVpKkQjLr7kEx\nBllJUr0anMla+CRJUiFmspKkejU4kzXISpLq1eDrZA2ykqR6NTiTdU5WkqRCzGQlSfXyEh5Jkgpp\n8HCxQVaSVC+DrCRJhTS4utjCJ0mSCjGTlSTVKjssfJIkqQznZCVJKqTBc7IGWUlSvRo8XGzhkyRJ\nhZjJSpLq5ZysJEmFGGQlSSqkwWsXOycrSVIhBtmG+eb3f8hWu+zHngd+fm7b6edcyDZ7HMjeBx/B\n3gcfwQ033wbAzJkz+eZ//JCPH/QF9jr4cG77491z3zNz5kyOP+lUdtnvX9ht/89xzXU39fp3kRbm\nrNGn8Jepf+KuO6+tuytqR0dHe9tCRMS5ETE9Iu7p1PaDiHggIu6OiF9ExJBOr30tIiZHxIMRsUOn\n9h2rtskRcVxXvppBtmH23Hl7/vuHJ/xT+0H77sllY07nsjGns9UHNwPg0glXA/CLC87grB9/n5NP\nO4uO6gd75phxDF1+CL8adzZXXHQm79tkw977ElIXjR07nl12PaDubqhdHdnetnDnAzvO03YNsEFm\nvgd4CPgaQES8G9gPWL96z08jYmBEDAROB3YC3g3sXx27QAbZhnnfxhsyeLm3denYRx77M5u9dyMA\nVlh+CG9767Lc+8DDAPziV7/lXw7aF4ABAwaw/JDBZTosteHGm27lb889X3c31K7saG9b2Mdn3gD8\nbZ6232bmrOrpLcDwan8PYFxmvpqZjwKTgc2qbXJmTsnM14Bx1bELVLzwKSLeA6zV+VyZeXnp8+qN\nLr7sSiZcfS3rrzeSY4/8HIOXexvvXGcEv7/pFnbebmuemv4M9z04maeefoY1Vx8GwGlnjWXinXez\n+rBV+foxh7Pi0OVr/haSGqn+xSg+C1xS7Q+jFXTnmFq1ATwxT/sHFvbBRTPZiDgXOBfYG9it2nZd\nwPGjIuL2iLj97LEXl+xav7Lvx3fhqvHnctn5p7PSCkP5wWlnAfDxXXZg5ZVWZN9Dj+KkU89k4w3e\nxYCBA5g9ezZPT3+WjTd8Fz8/7zQ22uBdnHza2TV/C0l6c51jR7WN6sZ7vwHMAi4q0bfSmezmmbnQ\nMes5MnM0MBpg5rNTav/Tpik6Z6Cf2H0njjj22wAsscRAvvqvh8197YDDjmGt1YcxZPByDFp6Kbb7\nyIcA+NhHt+TyK3/Tu52W1G9km9fJdo4d3RERh9BK/LbNnHsd0TRg9U6HDa/aWED7fJWek/1DVyaG\nVdYzz74+FXHt9TezzjvWBODlV17hpZdfAeDm2/7IEgMHsvaINYkIPvKhDzDxzla18a2338XaI9bo\n/Y5L6h/KFz79k4jYEfgKsHtmvtTppQnAfhGxVESMAEYCtwETgZERMSIilqRVHDVhYecpncmOpRVo\nnwJeBQLIqppLBRz77ROZeOfdPP/8C2y754EcfuhBTLzzbh58eAoEDFtlZb79laMA+NtzMzjs6G8Q\nAwaw8kor8J///uW5n3PM4Z/la989mRNPPZOhQwZzwtePqesrSfN14QWn85GttmDFFYfy2JTb+c53\nT+a888fV3S11V+G78ETExcDWwIoRMRX4Nq1q4qWAayIC4JbM/Hxm3hsR44H7aA0jH5GZs6vPORL4\nDTAQODcz713oubPgShsRMRk4BpgEzP2/mJmPL+y9DherCQattmXdXZAWiVmvTYtSn/3iCQe29e/9\nst+8sFjf2lU6k30mMxeaTkuS+rH6q4uLKR1k74yInwFX0houBryER5LUiTcI6LFBtILrxzq1JWCQ\nlSS1mMn2TGZ+puTnS5IaoHDhU52KBtmIWBo4lNYakEvPac/Mz5Y8ryRJfUHp62QvAFYBdgCup3Xx\n7t8
Ln1OStDip4TrZ3lJ6TnadzNwnIvbIzDFVEdSNhc8pSVqMtLviU19WOsjOrB6fj4gNgKeAtxc+\npyRpcdLHs9F2lA6yoyNieeCbtJafeivwrcLnlCQtTgyyPXYBrTvwrAWMqdpWLnxOSZL6hNJB9gpg\nBnAHnRajkCRpLi/h6bHhmblj4XNIkhZnDhf32M0RsWFmTip8HknSYioNst0TEZNoLZ+4BPCZiJiC\nt7qTJPUzpTLZXQt9riSpacxku6cr94uVJAnwLjySJBVjJitJUiENDrKlbxAgSVK/ZSYrSapVZnMz\nWYOsJKleDR4uNshKkuplkJUkqYwmr/hk4ZMkSYWYyUqS6tXgTNYgK0mqV3MXfDLISpLq5ZysJEnq\nNjNZSVK9GpzJGmQlSfVyTlaSpDKaPCdrkJUk1avBmayFT5IkFWImK0mqlcPFkiSV0uDhYoOsJKlW\naZCVJKmQBgdZC58kSSrETFaSVCuHiyVJKsUgK0lSGU3OZJ2TlSSpEDNZSVKtmpzJGmQlSbUyyEqS\nVEpG3T0oxiArSapVkzNZC58kSSrETFaSVKvscLhYkqQimjxcbJCVJNUqLXySJKmMJmeyFj5JklSI\nmawkqVZNLnwyk5Uk1Sqzva0rIuJfI+KeiLg3Ir5UtQ2NiGsi4uHqcfmqPSLivyJickTcHRGb9vS7\nGWQlSbXKjmhrW5iI2AD4HLAZsBGwa0SsAxwHXJuZI4Frq+cAOwEjq20UcEZPv5tBVpLUdO8Cbs3M\nlzJzFnA9sBewBzCmOmYMsGe1vwcwNltuAYZExKo9ObFBVpJUq3Yz2YgYFRG3d9pGzXOKe4AtI2KF\niFgG2BlYHVg5M5+sjnkKWLnaHwY80en9U6u2brPwSZJUq67Oq87//TkaGL2A1++PiJOA3wIvAncB\ns+c5JiOizZ78MzNZSVKtSs/JAmTmOZn53szcCngOeAh4es4wcPU4vTp8Gq1Md47hVVu3GWQlSbXK\njLa2roiIt1ePa9Caj/0ZMAE4uDrkYOCKan8C8OmqynhzYEanYeVucbhYktQfXBYRKwAzgSMy8/mI\nOBEYHxGHAo8Dn6yO/TWtedvJwEvAZ3p6UoOsJKlWvbGsYmZu+SZtfwW2fZP2BI5YFOc1yEqSatXh\nDQIkSSqjX96FJyKuBOZbzpyZuxfpkSSpX2ny2sULymRP7rVeSJLUQPMNspl5/Zz9iBgErJGZD/ZK\nryRJ/Ua7i1H0ZQu9TjYidqO1OsbV1fONI2JC6Y5JkvqH3liMoi5dKXw6ntadC34PkJl3RcSIgn2S\nJPUjTa4u7sqKTzMzc8Y8bQ1O7iVJWjS6ksneGxGfAgZGxEjgKODmst2SJPUXTb6EpyuZ7BeB9YFX\ngYuBF4AvleyUJKn/yGxv68sWmslm5kvAN6pNkqRFqslzsgsNshFxHW8yB5uZ2xTpkSSpX2nycHFX\n5mS/3Gl/aWBvYFaZ7kiS1BxdGS6+Y56m/4uI2wr1R5LUz/T1edV2dGW4eGinpwOA9wKDi/Wosuyw\nrUqfQipukxXXrrsLUp/Xr+dkgTtozckGrWHiR4FDS3ZKktR/9Pc52Xdl5iudGyJiqUL9kST1M03O\nZLtyneybLTzxh0XdEUmSmmZB95NdBRgGDIqITWgNFwMsByzTC32TJPUDDa57WuBw8Q7AIcBw4BRe\nD7IvAF8v2y1JUn/R5OHiBd1PdgwwJiL2zszLerFPkqR+pMmFT12Zk31vRAyZ8yQilo+IEwr2SZKk\nRuhKkN0pM5+f8yQznwN2LtclSVJ/0tHm1pd15RKegRGxVGa+ChARgwAv4ZEkLRJJc4eLuxJkLwKu\njYjzaBU/HQKMKdkpSVL/0dHg8uKurF18UkT8CdiOVqX1b4A1S3dMktQ/dDQ4k+3KnCzA07QC7D7A\nNsD9xXokSVJDLGgxinWB/avtWeASIDLzo73UN0lSP9Bf52QfAG4Ed
s3MyQARcXSv9EqS1G/09Qrh\ndixouHgv4Enguog4KyK2hQb/uSFJqkUSbW192XyDbGb+T2buB6wHXAd8CXh7RJwRER/rrQ5KkrS4\nWmjhU2a+mJk/y8zdaK1jfCfw1eI9kyT1C/19MYq5qtWeRlebJElt6+uBsh3dCrKSJC1qfX1etR0G\nWUlSrTqaG2O7vBiFJEnqJjNZSVKtmrysokFWklSrBt8fwCArSaqX1cWSJBXSEc0dLrbwSZKkQsxk\nJUm1ck5WkqRCnJOVJKkQF6OQJEndZiYrSaqVi1FIklSIhU+SJBXS5DlZg6wkqVZNri628EmSpELM\nZCVJtXJOVpKkQpo8J+twsSSpVh1tbl0REUMi4tKIeCAi7o+ILSJiaERcExEPV4/LV8dGRPxXREyO\niLsjYtOefjeDrCSpVr0RZIFTgaszcz1gI+B+4Djg2swcCVxbPQfYCRhZbaOAM3r63QyykqRGi4jB\nwFbAOQCZ+VpmPg/sAYypDhsD7Fnt7wGMzZZbgCERsWpPzm2QlSTVKqO9LSJGRcTtnbZR85xiBPAM\ncF5E3BkRZ0fEssDKmflkdcxTwMrV/jDgiU7vn1q1dZuFT5KkWrV7nWxmjgZGL+CQJYBNgS9m5q0R\ncSqvDw3P+YyMiEVe6GwmK0mqVS/MyU4FpmbmrdXzS2kF3afnDANXj9Or16cBq3d6//CqrdsMspKk\nRsvMp4AnIuKdVdO2wH3ABODgqu1g4IpqfwLw6arKeHNgRqdh5W5xuFiSVKteWozii8BFEbEkMAX4\nDK1Ec3xEHAo8DnyyOvbXwM7AZOCl6tgeMchKkmrVG4tRZOZdwPve5KVt3+TYBI5YFOc1yEqSatXk\nGwQYZCVJtWpykLXwSZKkQsxkJUm18i48kiQV0uS78BhkJUm1avKcrEFWklSrJg8XW/gkSVIhZrKS\npFp1NDiXNchKkmrlnKwkSYU0N491TlaSpGLMZCVJtXK4WJKkQlyMQpKkQqwuliSpkOaGWAufJEkq\nxkxWklQrC58kSSrEOVlJkgppbog1yEqSatbk4WILnyRJKsRMVpJUK+dkJUkqpLkh1iArSaqZc7KS\nJKnbzGQlSbXKBg8YG2QlSbVq8nCxQVaSVCuriyVJKqS5IdbCJ0mSijGT7SdGn3kyO++8Hc888yyb\nbLodABdd+FPWXXdtAAYPXo4ZM17g/ZvtUGc3pX+y8mpv5/hTv87QlYZCJr+48ErGnXMpR33rC2y5\n/QeZ+dospj4+je8efSL/eOEfrDp8FcZffwF/nvJnACbdcR8nHndKzd9CC+JwsRZ7Yy/4OT8943zO\nO/fHc9sOOPDwufsnnfQtXpjx9zq6Ji3QrFmz+fF3f8qDkx5imWUHMfbqs7n1honcesPtnP790cye\nPZsjv/F5DvnigZz2H/8NwLTHp3HA9ofW3HN1VZMLnxwu7iduuulWnnvu+fm+/om9d+OS8Vf0Yo+k\nrvnr9L/y4KSHAHjpxZd5bPLjrLTqStx6/URmz54NwD133MvKq65UZzfVhmzzv76saCYbEQOBXYC1\nOp8rM39Y8rzqng9/+ANMn/4Mkyc/WndXpAVadfgqvHODkdz7x/ve0L77/jtzzRW/m/t8tTVW5cLf\nns2Lf3+JM046m7tuu7u3u6puaHImW3q4+ErgFWASXfj/GBGjgFEAAwcOYcDAZcv2TgDsu+8eZrHq\n8wYtM4iTzv4eP/z3n/DiP16a2/6Zow5i1qzZXHX5NQA8O/2v7Pb+fZjx3Aust+G6nHze99l360+/\n4T1SbykdZIdn5nu6enBmjgZGAyy51PC+PQbQEAMHDmTPPXZi8y12rrsr0nwNXGIgJ539Pa6+/Bqu\nu+qGue27fnJHPrzdFhy+79Fz22a+NpMZr80E4IFJDzH1sWms8Y7Vuf/uB3u93+qavj7k247Sc7JX\nRcTHCp9Dbdh22y158MFHmDbty
bq7Is3Xt075Ko89/Dg/Gz1+btsWW2/GQYd/in875Gu8+vKrc9uH\nDB3MgAGtf9qGrbEqq48YzrQ//6XX+6yu62hz68tKZ7K3AL+IiAHATCCAzMzlCp9X87hg7GlstdUW\nrLjiUKY8MpHvfu8Uzj9/HJ/cZ3cuGf8/dXdPmq+NNtuQXfbZkYfve4SLrjkHgNP/8yy+/L2jWHKp\nJTn9klaJx5xLdTbZfGM+f+xnmTVrFh0dyYnHncILz1s535d1ZHMz2ciCXy4iHgX2ACZlN0/kcLGa\nYKMV3lF3F6RFYuJfbohSn33Qmnu19e/9BY9fXqxv7SqdyT4B3NPdACtJ6j+aHCBKB9kpwO8j4ipg\n7qSJl/BIkuZwxaeee7Talqw2SZLeoMnVxUWDbGZ+p+TnS5IWf329QrgdpVd8Wgn4CrA+sPSc9szc\npuR5JUnqC0pfJ3sR8AAwAvgO8BgwsfA5JUmLkQ6yra0vKx1kV8jMc4CZmXl9Zn4WMIuVJM3lDQJ6\nbmb1+GRE7AL8BRha+JySpMWIc7I9d0JEDAb+DfgJsBxw9ILfIknqT5q8lELp6uJfVrszgI+WPJck\nSX1N0TnZiHhHRFwZEc9GxPSIuCIiXGdOkjSXhU899zNgPLAKsBrwc+DiwueUJC1GmnwXntJBdpnM\nvCAzZ1XbhXS6XlaSpNLVxRGxdETcFhF/ioh7I+I7VfuIiLg1IiZHxCURsWTVvlT1fHL1+lo9/W69\ncT/Z4yJirYhYMyK+Avw6IoZGhFXGkqTe8CqwTWZuBGwM7BgRmwMnAT/KzHWA54BDq+MPBZ6r2n9U\nHdcjpauLP1k9HsbrN1oIYL/qufOzktTPlZ5Xre4E94/q6VuqLWmt2/Cpqn0McDxwBq1btB5ftV8K\nnBYR0ZM7ypXOZL8KbJSZI4DzgD8Be2fmiMw0wEqSyMy2tq6IiIERcRcwHbgGeAR4PjNnVYdMBYZV\n+8No3aqV6vUZwAo9+W6lg+w3M/OFiPgwrb8Yzqb1V4IkSUD7hU8RMSoibu+0jZr3HJk5OzM3BoYD\nmwHrFf9ilB8unl097gKclZm/iogTCp9TkrQYaXdpxMwcDYzu4rHPR8R1wBbAkIhYospWhwPTqsOm\nAasDUyNiCWAw8Nee9K10JjstIs4E9qVV8LRUL5xTkqS5ImKliBhS7Q8CtgfuB64DPlEddjBwRbU/\noXpO9frvejIfC71T+LQjcHL118OqwLGFzylJWoz0woISqwJjImIgrURvfGb+MiLuA8ZVI6x3AudU\nx58DXBARk4G/0SrW7ZHSyyq+BFze6fmTwJMlzylJWryUXrs4M+8GNnmT9im05mfnbX8F2GdRnLt0\nJitJ0gL19aUR2+H8qCRJhZjJSpJq1ddvvN4Og6wkqVYd3k9WkqQymhtiDbKSpJpZ+CRJkrrNTFaS\nVKsmZ7IGWUlSrUovRlEng6wkqVZmspIkFdLk62QtfJIkqRAzWUlSrZyTlSSpEOdkJUkqpMmZrHOy\nkiQVYiYrSaqVw8WSJBXS5Et4DLKSpFp5qztJkgppciZr4ZMkSYWYyUqSauVwsSRJhTR5uNggK0mq\nlZmsJEmFNDmTtfBJkqRCzGQlSbVyuFiSpEKaPFxskJUk1Sqzo+4uFOOcrCRJhZjJSpJq5V14JEkq\npMk3bTfISpJqZSYrSVIhTc5kLXySJKkQM1lJUq1cjEKSpEJcjEKSpEKaPCdrkJUk1arJ1cUWPkmS\nVIiZrCSpVg4XS5JUiNXFkiQV0uRM1jlZSZIKMZOVJNWqydXFBllJUq2aPFxskJUk1crCJ0mSCmny\nsooWPkmSVIiZrCSpVg4XS5JUiIVPkiQV0uQ5WYOsJKlWTc5kLXySJKkQM1lJUq2anMkaZCVJtWpu\niIVo8l8QWrCIGJWZo+vuh9Quf8vqq5yT7d9G1d0BaRHxt6w+ySArSVIhBllJkgoxyPZvzmGpKfw
t\nq0+y8EmSpELMZCVJKsQg20ARsVZE3FN3PySpvzPISpJUiEG2uQZGxFkRcW9E/DYiBkXE5yJiYkT8\nKSIui4hlACLi/Ig4IyJuiYgpEbF1RJwbEfdHxPk1fw/1MxGxbET8qvqd3hMR+0bEYxHx/yJiUkTc\nFhHrVMfuFhG3RsSdEfG/EbFy1X58RIyJiBsj4vGI2KvT+6+OiLfU+y3VXxhkm2skcHpmrg88D+wN\nXJ6Z78/MjYD7gUM7Hb88sAVwNDAB+BGwPrBhRGzcqz1Xf7cj8JfM3CgzNwCurtpnZOaGwGnAj6u2\nm4DNM3MTYBzwlU6fszawDbA7cCFwXfX+l4Fdyn8NySDbZI9m5l3V/h3AWsAG1V/2k4ADaAXROa7M\nVqn5JODpzJyUmR3AvdV7pd4yCdg+Ik6KiC0zc0bVfnGnxy2q/eHAb6rf9LG88Td9VWbOrD5vIK8H\n60n4m1YvMcg216ud9mfTuhnE+cCR1V/z3wGWfpPjO+Z5bwfeSEK9KDMfAjalFQxPiIh/n/NS58Oq\nx58Ap1W/6cN4k9909cfizHz9ekV/0+o1Btn+5W3Ak9V81AF1d0Z6MxGxGvBSZl4I/IBWwAXYt9Pj\nH6r9wcC0av/gXuuk1EX+Nde/fAu4FXimenxbvd2R3tSGwA8iogOYCXwBuBRYPiLuppWh7l8dezzw\n84h4DvgdMKL3uyvNnys+SerzIuIx4H2Z+WzdfZG6w+FiSZIKMZOVJKkQM1lJkgoxyEqSVIhBViok\nItaPiN3r7oek+hhkpS6KiNkRcVe1nu7P56z9PJ9j1wC+Afx+Pq9vHRG/rPZ3j4jjqv09I+LdBbov\nqQYGWanrXs7Mjav1dF8DPt/5xWgZAJCZf87MT2XmCwv70MyckJknVk/3BAyyUkMYZKWeuRFYp7p3\n74MRMRa4B1g9Ij4WEX+IiD9WGe9bASJix4h4ICL+COw154Mi4pCIOC0iPkhrMfsfVBnz2nV8MUmL\njkFW6qaIWALYidbautC649FPqzsevQh8E9guMzcFbgeOiYilgbOA3YD3AqvM+7mZeTOtOyAdW2XM\njxT/MpKKcllFqesGRcScOxvdCJwDrAY8npm3VO2b0xru/b+IAFiS1jq769G6M9LDABFxITCqF/su\nqQYGWanrXs7MN9xbtwqkL3ZuAq7JzP3nOc578kr9kMPF0qJ1C/ChiFgHICKWjYh1gQeAtTrNs+4/\nn/f/HW/cIDWGQVZahDLzGeAQ4OLqjjF/ANbLzFdoDQ//qip8mj6fjxgHHBsRd1r4JC3+XLtYkqRC\nzGQlSSrEICtJUiEGWUmSCjHISpJUiEFWkqRCDLKSJBVikJUkqRCDrCRJhfx/YJWjuRmq0BwAAAAA\nSUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "s2_y66pe5hAa" - }, - "source": [ - "On remarque un petit peu d'amélioration, alors SVM est un bon choix pour ce cas de dataSet " - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "outputId": "2112fb86-7a3b-4cce-8dca-a7447e844980", - "id": "m_oAPb_V5hAb", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 619 - } - }, - "source": [ - "from IPython.display import display\n", - "\n", - "for predicted in category_id_df.category_id:\n", - " for actual in category_id_df.category_id:\n", - " if predicted != actual and conf_mat[actual, predicted] >= 6:\n", - " print(\"'{}' est prédit comme '{}' : {} exemples.\".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))\n", - " display(df_lem.loc[indices_test[(y_test == actual) & (y_pred == predicted)]][['Category', 'Message_lemmatized_before_stemming']])\n", - " print('')" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "'spam' est prédit comme 'ham' : 17 exemples.\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CategoryMessage_lemmatized_before_stemming
684spamHi I 'm sue . I 20 year old work lapdancer . I love sex . Text live - I 'm bedroom . text SUE 89555 . By TextOperator G2 1DA 150ppmsg 18+
731spamEmail AlertFrom : Jeri StewartSize : 2KBSubject : Low-cost prescripiton drvgsTo listen email call 123
1940spamMore people dogging area . Call 09090204448 join like minded guy . Why arrange 1 . There 's 1 evening . A£1.50 minAPN LS278BB
751spamDo realize 40 year , 'll thousand old lady running around tattoo ?
4213spamMissed call alert . These number called left message . 07008009200
4298spamthesmszone.com let send free anonymous masked messages..im sending message there..do see potential abuse ? ? ?
333spamCall Germany 1 penny per minute ! Call fixed line via access number 0844 861 85 85 . No prepayment . Direct access !
3864spamOh god ! I 've found number ! I 'm glad , text back xafter msg cst std ntwk chg £1.50
5449spamLatest News ! Police station toilet stolen , cop nothing go !
1430spamFor sale - arsenal dartboard . Good condition double treble !
2247spamHi ya babe x u 4goten bout ? ' scammer getting smart..Though regular vodafone , respond get prem rate msg/subscription . Other no used also . Beware !
3991spam( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per..
3460spamNot heard U4 . Call night knickers . Make beg like U last time 01223585236 XX Luv Nikiyu4.net
2823spamROMCAPspam Everyone around responding well presence since warm outgoing . You bringing real breath sunshine .
2402spamBabe : U want dont u baby ! Im nasty thing 4 filthyguys . Fancy rude time sexy bitch . How go slo n hard ! Txt XXX SLO ( 4msgs )
3501spamDorothy @ kiefer.com ( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per..
4821spamCheck Out Choose Your Babe Videos @ sms.shsex.netUN fgkslpoPW fgkslpo
\n", - "
" - ], - "text/plain": [ - " Category Message_lemmatized_before_stemming\n", - "684 spam Hi I 'm sue . I 20 year old work lapdancer . I love sex . Text live - I 'm bedroom . text SUE 89555 . By TextOperator G2 1DA 150ppmsg 18+ \n", - "731 spam Email AlertFrom : Jeri StewartSize : 2KBSubject : Low-cost prescripiton drvgsTo listen email call 123 \n", - "1940 spam More people dogging area . Call 09090204448 join like minded guy . Why arrange 1 . There 's 1 evening . A£1.50 minAPN LS278BB \n", - "751 spam Do realize 40 year , 'll thousand old lady running around tattoo ? \n", - "4213 spam Missed call alert . These number called left message . 07008009200 \n", - "4298 spam thesmszone.com let send free anonymous masked messages..im sending message there..do see potential abuse ? ? ? \n", - "333 spam Call Germany 1 penny per minute ! Call fixed line via access number 0844 861 85 85 . No prepayment . Direct access ! \n", - "3864 spam Oh god ! I 've found number ! I 'm glad , text back xafter msg cst std ntwk chg £1.50 \n", - "5449 spam Latest News ! Police station toilet stolen , cop nothing go ! \n", - "1430 spam For sale - arsenal dartboard . Good condition double treble ! \n", - "2247 spam Hi ya babe x u 4goten bout ? ' scammer getting smart..Though regular vodafone , respond get prem rate msg/subscription . Other no used also . Beware ! \n", - "3991 spam ( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per.. \n", - "3460 spam Not heard U4 . Call night knickers . Make beg like U last time 01223585236 XX Luv Nikiyu4.net \n", - "2823 spam ROMCAPspam Everyone around responding well presence since warm outgoing . You bringing real breath sunshine . \n", - "2402 spam Babe : U want dont u baby ! Im nasty thing 4 filthyguys . Fancy rude time sexy bitch . How go slo n hard ! 
Txt XXX SLO ( 4msgs ) \n", - "3501 spam Dorothy @ kiefer.com ( Bank Granite issue Strong-Buy ) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300 % *********** Nasdaq Symbol CDGT That $ 5.00 per..\n", - "4821 spam Check Out Choose Your Babe Videos @ sms.shsex.netUN fgkslpoPW fgkslpo " - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab_type": "code", - "id": "gzgUQ8ml5hAe", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] - } - ] -} \ No newline at end of file