{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Two toy documents used throughout this TF-IDF walkthrough\n", "d1 = 'the man went out for a walk'\n", "d2 = 'the children sat around the fire'" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Bag of words: naive whitespace tokenization of each document\n", "bow1 = d1.split()\n", "bow2 = d2.split()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['the', 'man', 'went', 'out', 'for', 'a', 'walk'] ['the', 'children', 'sat', 'around', 'the', 'fire']\n" ] } ], "source": [ "print(bow1, bow2)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire', 'went', 'a', 'out', 'children', 'walk', 'man', 'sat', 'around', 'for', 'the'}\n" ] } ], "source": [ "# Shared vocabulary: union of the words seen in both documents\n", "unique_words = set(bow1).union(set(bow2))\n", "\n", "print(unique_words)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Raw term counts per document over the shared vocabulary;\n", "# words missing from a document keep their initial count of 0\n", "dist_of_words1 = dict.fromkeys(unique_words, 0)\n", "\n", "for word in bow1:\n", "    dist_of_words1[word] += 1\n", "    \n", "dist_of_words2 = dict.fromkeys(unique_words, 0)\n", "for word in bow2:\n", "    dist_of_words2[word] += 1" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire': 0, 'went': 1, 'a': 1, 'out': 1, 'children': 0, 'walk': 1, 'man': 1, 'sat': 0, 'around': 0, 'for': 1, 'the': 1}\n", "{'fire': 1, 'went': 0, 'a': 0, 'out': 0, 'children': 1, 'walk': 0, 'man': 0, 'sat': 1, 'around': 1, 'for': 0, 'the': 2}\n" ] } ], "source": [ "print(dist_of_words1)\n", "print(dist_of_words2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['i',\n", " 'me',\n", " 'my',\n", " 'myself',\n", " 'we',\n", " 'our',\n", " 'ours',\n", " 'ourselves',\n", " 'you',\n", " \"you're\",\n", " \"you've\",\n", " \"you'll\",\n", " \"you'd\",\n", " 
'your',\n", " 'yours',\n", " 'yourself',\n", " 'yourselves',\n", " 'he',\n", " 'him',\n", " 'his',\n", " 'himself',\n", " 'she',\n", " \"she's\",\n", " 'her',\n", " 'hers',\n", " 'herself',\n", " 'it',\n", " \"it's\",\n", " 'its',\n", " 'itself',\n", " 'they',\n", " 'them',\n", " 'their',\n", " 'theirs',\n", " 'themselves',\n", " 'what',\n", " 'which',\n", " 'who',\n", " 'whom',\n", " 'this',\n", " 'that',\n", " \"that'll\",\n", " 'these',\n", " 'those',\n", " 'am',\n", " 'is',\n", " 'are',\n", " 'was',\n", " 'were',\n", " 'be',\n", " 'been',\n", " 'being',\n", " 'have',\n", " 'has',\n", " 'had',\n", " 'having',\n", " 'do',\n", " 'does',\n", " 'did',\n", " 'doing',\n", " 'a',\n", " 'an',\n", " 'the',\n", " 'and',\n", " 'but',\n", " 'if',\n", " 'or',\n", " 'because',\n", " 'as',\n", " 'until',\n", " 'while',\n", " 'of',\n", " 'at',\n", " 'by',\n", " 'for',\n", " 'with',\n", " 'about',\n", " 'against',\n", " 'between',\n", " 'into',\n", " 'through',\n", " 'during',\n", " 'before',\n", " 'after',\n", " 'above',\n", " 'below',\n", " 'to',\n", " 'from',\n", " 'up',\n", " 'down',\n", " 'in',\n", " 'out',\n", " 'on',\n", " 'off',\n", " 'over',\n", " 'under',\n", " 'again',\n", " 'further',\n", " 'then',\n", " 'once',\n", " 'here',\n", " 'there',\n", " 'when',\n", " 'where',\n", " 'why',\n", " 'how',\n", " 'all',\n", " 'any',\n", " 'both',\n", " 'each',\n", " 'few',\n", " 'more',\n", " 'most',\n", " 'other',\n", " 'some',\n", " 'such',\n", " 'no',\n", " 'nor',\n", " 'not',\n", " 'only',\n", " 'own',\n", " 'same',\n", " 'so',\n", " 'than',\n", " 'too',\n", " 'very',\n", " 's',\n", " 't',\n", " 'can',\n", " 'will',\n", " 'just',\n", " 'don',\n", " \"don't\",\n", " 'should',\n", " \"should've\",\n", " 'now',\n", " 'd',\n", " 'll',\n", " 'm',\n", " 'o',\n", " 're',\n", " 've',\n", " 'y',\n", " 'ain',\n", " 'aren',\n", " \"aren't\",\n", " 'couldn',\n", " \"couldn't\",\n", " 'didn',\n", " \"didn't\",\n", " 'doesn',\n", " \"doesn't\",\n", " 'hadn',\n", " \"hadn't\",\n", " 
'hasn',\n", " \"hasn't\",\n", " 'haven',\n", " \"haven't\",\n", " 'isn',\n", " \"isn't\",\n", " 'ma',\n", " 'mightn',\n", " \"mightn't\",\n", " 'mustn',\n", " \"mustn't\",\n", " 'needn',\n", " \"needn't\",\n", " 'shan',\n", " \"shan't\",\n", " 'shouldn',\n", " \"shouldn't\",\n", " 'wasn',\n", " \"wasn't\",\n", " 'weren',\n", " \"weren't\",\n", " 'won',\n", " \"won't\",\n", " 'wouldn',\n", " \"wouldn't\"]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NLTK's English stop-word list, shown for reference\n", "# (it is not applied to the toy documents above)\n", "from nltk.corpus import stopwords\n", "\n", "stopwords.words('english')" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Second normalization example: relative to the number of words in the BOW\n", "def compute_TF(word_dict, bow):\n", "    \"\"\"Term frequency: each word's raw count divided by the document length.\n", "\n", "    word_dict: word -> raw count for one document (keys span the vocabulary).\n", "    bow: the document's token list; len(bow) is the normalizer.\n", "    \"\"\"\n", "    tf_dict = {}\n", "    bow_count = len(bow)\n", "    \n", "    for word, count in word_dict.items():\n", "        tf_dict[word] = count / bow_count\n", "    \n", "    return tf_dict" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "tf1 = compute_TF(dist_of_words1, bow1)\n", "tf2 = compute_TF(dist_of_words2, bow2)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire': 0.0, 'went': 0.14285714285714285, 'a': 0.14285714285714285, 'out': 0.14285714285714285, 'children': 0.0, 'walk': 0.14285714285714285, 'man': 0.14285714285714285, 'sat': 0.0, 'around': 0.0, 'for': 0.14285714285714285, 'the': 0.14285714285714285}\n", "{'fire': 0.16666666666666666, 'went': 0.0, 'a': 0.0, 'out': 0.0, 'children': 0.16666666666666666, 'walk': 0.0, 'man': 0.0, 'sat': 0.16666666666666666, 'around': 0.16666666666666666, 'for': 0.0, 'the': 0.3333333333333333}\n" ] } ], "source": [ "print(tf1)\n", "print(tf2)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "def compute_IDF(documents):\n", "    \"\"\"Inverse document frequency: log(N / df) per word, where df is the\n", "    number of documents containing the word and N the number of documents.\n", "\n", "    documents: list of word -> count dicts sharing the same keys.\n", "    \"\"\"\n", "    \n", "    import math\n", "    N = len(documents)\n", "    \n", "    idf_dict = 
dict.fromkeys(documents[0].keys(), 0)\n", "    \n", "    # df: count, for each word, the number of documents it occurs in\n", "    for document in documents:\n", "        for word, val in document.items():\n", "            if val > 0:\n", "                idf_dict[word] += 1\n", "    \n", "    # Log-scale the ratio. A word that appears in no document would\n", "    # otherwise divide by zero, so give it an IDF of 0.0 instead of raising.\n", "    for word, val in idf_dict.items():\n", "        idf_dict[word] = math.log(N / val) if val else 0.0\n", "    \n", "    return idf_dict" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire': 0.6931471805599453, 'went': 0.6931471805599453, 'a': 0.6931471805599453, 'out': 0.6931471805599453, 'children': 0.6931471805599453, 'walk': 0.6931471805599453, 'man': 0.6931471805599453, 'sat': 0.6931471805599453, 'around': 0.6931471805599453, 'for': 0.6931471805599453, 'the': 0.0}\n" ] } ], "source": [ "# 'the' occurs in both documents, so its IDF is log(2/2) = 0\n", "idfs = compute_IDF([dist_of_words1, dist_of_words2])\n", "\n", "print(idfs)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def compute_TFIDF(tf_bow, idfs):\n", "    \"\"\"TF-IDF: multiply each word's term frequency by its corpus-level IDF.\"\"\"\n", "    tfidf = {}\n", "    for word, val in tf_bow.items():\n", "        tfidf[word] = val * idfs[word]\n", "    \n", "    return tfidf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %pip (rather than !pip) installs into the running kernel's environment\n", "%pip install pandas" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " a around children fire for man out \\\n", "0 0.099021 0.000000 0.000000 0.000000 0.099021 0.099021 0.099021 \n", "1 0.000000 0.115525 0.115525 0.115525 0.000000 0.000000 0.000000 \n", "\n", " sat the walk went \n", "0 0.000000 0.0 0.099021 0.099021 \n", "1 0.115525 0.0 0.000000 0.000000 \n" ] } ], "source": [ "# Combine the hand-rolled TF and IDF into TF-IDF vectors and tabulate them\n", "tfidf1 = compute_TFIDF(tf1, idfs)\n", "tfidf2 = compute_TFIDF(tf2, idfs)\n", "\n", "\n", "import pandas as pd\n", "\n", "df = pd.DataFrame([tfidf1, tfidf2])\n", "print(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install scikit-learn" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ " around children fire for man out sat \\\n", "0 0.000000 0.000000 0.000000 0.42616 0.42616 0.42616 0.000000 \n", "1 0.407401 0.407401 0.407401 0.00000 0.00000 0.00000 0.407401 \n", "\n", " the walk went \n", "0 0.303216 0.42616 0.42616 \n", "1 0.579739 0.00000 0.00000 \n" ] } ], "source": [ "# Cross-check against scikit-learn's TfidfVectorizer (sklearn smooths the\n", "# IDF term, so the numbers differ slightly from the hand-rolled version)\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "vectorizer = TfidfVectorizer()\n", "vectors = vectorizer.fit_transform([d1, d2])\n", "\n", "# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;\n", "# get_feature_names_out() is the supported replacement.\n", "feature_names = vectorizer.get_feature_names_out()\n", "dense = vectors.todense()\n", "denselist = dense.tolist()\n", "df = pd.DataFrame(denselist, columns=feature_names)\n", "\n", "print(df)" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76\n", "\n", "The values differ slightly because sklearn uses a smoothed version idf and various other little optimizations. \n", "In an example with more text, the score for the word the would be greatly reduced." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }