{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Two toy documents used throughout this TF-IDF walkthrough\n", "d1 = 'the man went out for a walk'\n", "d2 = 'the children sat around the fire'" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Bag of words: naive whitespace tokenization of each document\n", "bow1 = d1.split()\n", "bow2 = d2.split()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['the', 'man', 'went', 'out', 'for', 'a', 'walk'] ['the', 'children', 'sat', 'around', 'the', 'fire']\n" ] } ], "source": [ "print(bow1, bow2)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire', 'went', 'a', 'out', 'children', 'walk', 'man', 'sat', 'around', 'for', 'the'}\n" ] } ], "source": [ "# Shared vocabulary: union of the words seen in both documents\n", "unique_words = set(bow1).union(set(bow2))\n", "\n", "print(unique_words)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Raw term counts per document over the shared vocabulary;\n", "# words missing from a document keep their initial count of 0\n", "dist_of_words1 = dict.fromkeys(unique_words, 0)\n", "\n", "for word in bow1:\n", "    dist_of_words1[word] += 1\n", "    \n", "dist_of_words2 = dict.fromkeys(unique_words, 0)\n", "for word in bow2:\n", "    dist_of_words2[word] += 1" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire': 0, 'went': 1, 'a': 1, 'out': 1, 'children': 0, 'walk': 1, 'man': 1, 'sat': 0, 'around': 0, 'for': 1, 'the': 1}\n", "{'fire': 1, 'went': 0, 'a': 0, 'out': 0, 'children': 1, 'walk': 0, 'man': 0, 'sat': 1, 'around': 1, 'for': 0, 'the': 2}\n" ] } ], "source": [ "print(dist_of_words1)\n", "print(dist_of_words2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['i',\n", " 'me',\n", " 'my',\n", " 'myself',\n", " 'we',\n", " 'our',\n", " 'ours',\n", " 'ourselves',\n", " 'you',\n", " \"you're\",\n", " \"you've\",\n", " \"you'll\",\n", " \"you'd\",\n", " 
'your',\n", " 'yours',\n", " 'yourself',\n", " 'yourselves',\n", " 'he',\n", " 'him',\n", " 'his',\n", " 'himself',\n", " 'she',\n", " \"she's\",\n", " 'her',\n", " 'hers',\n", " 'herself',\n", " 'it',\n", " \"it's\",\n", " 'its',\n", " 'itself',\n", " 'they',\n", " 'them',\n", " 'their',\n", " 'theirs',\n", " 'themselves',\n", " 'what',\n", " 'which',\n", " 'who',\n", " 'whom',\n", " 'this',\n", " 'that',\n", " \"that'll\",\n", " 'these',\n", " 'those',\n", " 'am',\n", " 'is',\n", " 'are',\n", " 'was',\n", " 'were',\n", " 'be',\n", " 'been',\n", " 'being',\n", " 'have',\n", " 'has',\n", " 'had',\n", " 'having',\n", " 'do',\n", " 'does',\n", " 'did',\n", " 'doing',\n", " 'a',\n", " 'an',\n", " 'the',\n", " 'and',\n", " 'but',\n", " 'if',\n", " 'or',\n", " 'because',\n", " 'as',\n", " 'until',\n", " 'while',\n", " 'of',\n", " 'at',\n", " 'by',\n", " 'for',\n", " 'with',\n", " 'about',\n", " 'against',\n", " 'between',\n", " 'into',\n", " 'through',\n", " 'during',\n", " 'before',\n", " 'after',\n", " 'above',\n", " 'below',\n", " 'to',\n", " 'from',\n", " 'up',\n", " 'down',\n", " 'in',\n", " 'out',\n", " 'on',\n", " 'off',\n", " 'over',\n", " 'under',\n", " 'again',\n", " 'further',\n", " 'then',\n", " 'once',\n", " 'here',\n", " 'there',\n", " 'when',\n", " 'where',\n", " 'why',\n", " 'how',\n", " 'all',\n", " 'any',\n", " 'both',\n", " 'each',\n", " 'few',\n", " 'more',\n", " 'most',\n", " 'other',\n", " 'some',\n", " 'such',\n", " 'no',\n", " 'nor',\n", " 'not',\n", " 'only',\n", " 'own',\n", " 'same',\n", " 'so',\n", " 'than',\n", " 'too',\n", " 'very',\n", " 's',\n", " 't',\n", " 'can',\n", " 'will',\n", " 'just',\n", " 'don',\n", " \"don't\",\n", " 'should',\n", " \"should've\",\n", " 'now',\n", " 'd',\n", " 'll',\n", " 'm',\n", " 'o',\n", " 're',\n", " 've',\n", " 'y',\n", " 'ain',\n", " 'aren',\n", " \"aren't\",\n", " 'couldn',\n", " \"couldn't\",\n", " 'didn',\n", " \"didn't\",\n", " 'doesn',\n", " \"doesn't\",\n", " 'hadn',\n", " \"hadn't\",\n", " 
'hasn',\n", " \"hasn't\",\n", " 'haven',\n", " \"haven't\",\n", " 'isn',\n", " \"isn't\",\n", " 'ma',\n", " 'mightn',\n", " \"mightn't\",\n", " 'mustn',\n", " \"mustn't\",\n", " 'needn',\n", " \"needn't\",\n", " 'shan',\n", " \"shan't\",\n", " 'shouldn',\n", " \"shouldn't\",\n", " 'wasn',\n", " \"wasn't\",\n", " 'weren',\n", " \"weren't\",\n", " 'won',\n", " \"won't\",\n", " 'wouldn',\n", " \"wouldn't\"]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NLTK's English stop-word list, shown for reference\n", "# (it is not applied to the toy documents above)\n", "from nltk.corpus import stopwords\n", "\n", "stopwords.words('english')" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Second normalization example: relative to the number of words in the BOW\n", "def compute_TF(word_dict, bow):\n", "    \"\"\"Term frequency: each word's raw count divided by the document length.\n", "\n", "    word_dict: word -> raw count for one document (keys span the vocabulary).\n", "    bow: the document's token list; len(bow) is the normalizer.\n", "    \"\"\"\n", "    tf_dict = {}\n", "    bow_count = len(bow)\n", "    \n", "    for word, count in word_dict.items():\n", "        tf_dict[word] = count / bow_count\n", "    \n", "    return tf_dict" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "tf1 = compute_TF(dist_of_words1, bow1)\n", "tf2 = compute_TF(dist_of_words2, bow2)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire': 0.0, 'went': 0.14285714285714285, 'a': 0.14285714285714285, 'out': 0.14285714285714285, 'children': 0.0, 'walk': 0.14285714285714285, 'man': 0.14285714285714285, 'sat': 0.0, 'around': 0.0, 'for': 0.14285714285714285, 'the': 0.14285714285714285}\n", "{'fire': 0.16666666666666666, 'went': 0.0, 'a': 0.0, 'out': 0.0, 'children': 0.16666666666666666, 'walk': 0.0, 'man': 0.0, 'sat': 0.16666666666666666, 'around': 0.16666666666666666, 'for': 0.0, 'the': 0.3333333333333333}\n" ] } ], "source": [ "print(tf1)\n", "print(tf2)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "def compute_IDF(documents):\n", "    \"\"\"Inverse document frequency: log(N / df) per word, where df is the\n", "    number of documents containing the word and N the number of documents.\n", "\n", "    documents: list of word -> count dicts sharing the same keys.\n", "    \"\"\"\n", "    \n", "    import math\n", "    N = len(documents)\n", "    \n", "    idf_dict = 
dict.fromkeys(documents[0].keys(), 0)\n", "    \n", "    # df: count, for each word, the number of documents it occurs in\n", "    for document in documents:\n", "        for word, val in document.items():\n", "            if val > 0:\n", "                idf_dict[word] += 1\n", "    \n", "    # Log-scale the ratio. A word that appears in no document would\n", "    # otherwise divide by zero, so give it an IDF of 0.0 instead of raising.\n", "    for word, val in idf_dict.items():\n", "        idf_dict[word] = math.log(N / val) if val else 0.0\n", "    \n", "    return idf_dict" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'fire': 0.6931471805599453, 'went': 0.6931471805599453, 'a': 0.6931471805599453, 'out': 0.6931471805599453, 'children': 0.6931471805599453, 'walk': 0.6931471805599453, 'man': 0.6931471805599453, 'sat': 0.6931471805599453, 'around': 0.6931471805599453, 'for': 0.6931471805599453, 'the': 0.0}\n" ] } ], "source": [ "# 'the' occurs in both documents, so its IDF is log(2/2) = 0\n", "idfs = compute_IDF([dist_of_words1, dist_of_words2])\n", "\n", "print(idfs)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def compute_TFIDF(tf_bow, idfs):\n", "    \"\"\"TF-IDF: multiply each word's term frequency by its corpus-level IDF.\"\"\"\n", "    tfidf = {}\n", "    for word, val in tf_bow.items():\n", "        tfidf[word] = val * idfs[word]\n", "    \n", "    return tfidf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %pip (rather than !pip) installs into the running kernel's environment\n", "%pip install pandas" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " a around children fire for man out \\\n", "0 0.099021 0.000000 0.000000 0.000000 0.099021 0.099021 0.099021 \n", "1 0.000000 0.115525 0.115525 0.115525 0.000000 0.000000 0.000000 \n", "\n", " sat the walk went \n", "0 0.000000 0.0 0.099021 0.099021 \n", "1 0.115525 0.0 0.000000 0.000000 \n" ] } ], "source": [ "# Combine the hand-rolled TF and IDF into TF-IDF vectors and tabulate them\n", "tfidf1 = compute_TFIDF(tf1, idfs)\n", "tfidf2 = compute_TFIDF(tf2, idfs)\n", "\n", "\n", "import pandas as pd\n", "\n", "df = pd.DataFrame([tfidf1, tfidf2])\n", "print(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install scikit-learn" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ " around children fire for man out sat \\\n", "0 0.000000 0.000000 0.000000 0.42616 0.42616 0.42616 0.000000 \n", "1 0.407401 0.407401 0.407401 0.00000 0.00000 0.00000 0.407401 \n", "\n", " the walk went \n", "0 0.303216 0.42616 0.42616 \n", "1 0.579739 0.00000 0.00000 \n" ] } ], "source": [ "# Cross-check against scikit-learn's TfidfVectorizer (sklearn smooths the\n", "# IDF term, so the numbers differ slightly from the hand-rolled version)\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "vectorizer = TfidfVectorizer()\n", "vectors = vectorizer.fit_transform([d1, d2])\n", "\n", "# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;\n", "# get_feature_names_out() is the supported replacement.\n", "feature_names = vectorizer.get_feature_names_out()\n", "dense = vectors.todense()\n", "denselist = dense.tolist()\n", "df = pd.DataFrame(denselist, columns=feature_names)\n", "\n", "print(df)" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76\n", "\n", "The values differ slightly because sklearn uses a smoothed version idf and various other little optimizations. \n", "In an example with more text, the score for the word the would be greatly reduced." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }