prediction afp mono and balanced

D2KLab · May 15, 2020 · c8881c9 · c8881c9
1 parent 28b0ecf
commit c8881c9
Show file tree

Hide file tree

Showing 8 changed files with 72,332 additions and 4 deletions.
diff --git a/notebooks/afp_balancing.ipynb b/notebooks/afp_balancing.ipynb
@@ -0,0 +1,268 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# AFP balanced dataset\n",
+    "I want to extract from this dataset a subset that is balanced with respect to the categories in the ground truth."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Paths of the AFP corpus and of its ground-truth label file.\n",
+    "dataset = \"../data/afp.txt\"\n",
+    "labels = \"../data/afp_labels.txt\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[',,',\n",
+       " '13000000,13007000 13006000 13008000,',\n",
+       " '11000000,11014000 11001000 11002000 11006000,',\n",
+       " '11000000,11014000 11001000 11002000 11006000,',\n",
+       " '11000000,11014000 11001000 11002000 11006000,',\n",
+       " '11000000,11014000 11001000 11002000 11006000,',\n",
+       " '15000000,,',\n",
+       " '11000000,11014000 11001000 11002000 11006000,',\n",
+       " '02000000,02003000 02001000 16001000,02003001',\n",
+       " '15000000,15054000,']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# File iteration never yields an empty string, so no per-line filter is needed:\n",
+    "# every line of the label file (blank ones included) is kept, right-stripped.\n",
+    "with open(labels, \"r\") as datafile:\n",
+    "    true_topics = [raw.rstrip() for raw in datafile]\n",
+    "true_topics[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('',\n",
+       " '13000000',\n",
+       " '11000000',\n",
+       " '11000000',\n",
+       " '11000000',\n",
+       " '11000000',\n",
+       " '15000000',\n",
+       " '11000000',\n",
+       " '02000000',\n",
+       " '15000000')"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Transpose the comma-separated records into three parallel columns\n",
+    "# (the unpacking assumes exactly three fields per record).\n",
+    "true1, true2, true3 = zip(*(record.split(',') for record in true_topics))\n",
+    "true1[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Counter({'': 3542,\n",
+       "         '13000000': 419,\n",
+       "         '11000000': 12703,\n",
+       "         '15000000': 34025,\n",
+       "         '02000000': 2161,\n",
+       "         'multiple': 52838,\n",
+       "         '03000000': 1606,\n",
+       "         '10000000': 61,\n",
+       "         '08000000': 260,\n",
+       "         '16000000': 5462,\n",
+       "         '04000000': 9778,\n",
+       "         '12000000': 196,\n",
+       "         '01000000': 1509,\n",
+       "         '06000000': 269,\n",
+       "         '05000000': 13,\n",
+       "         '17000000': 172,\n",
+       "         '07000000': 335,\n",
+       "         '09000000': 8,\n",
+       "         '14000000': 159})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from collections import Counter\n",
+    "\n",
+    "# A first field holding several space-separated codes is grouped under 'multiple'.\n",
+    "categories = ['multiple' if ' ' in label else label for label in true1]\n",
+    "\n",
+    "counts = Counter(categories)\n",
+    "counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the corpus line by line; no line is dropped (file iteration never\n",
+    "# yields an empty string), each one is right-stripped.\n",
+    "with open(dataset, \"r\") as datafile:\n",
+    "    text = [raw.rstrip() for raw in datafile]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Subset 1: Only mono-label entries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of classes 14\n",
+      "Number of documents 69054\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Keep every single-label class that has at least 159 entries;\n",
+    "# unlabeled ('') and multi-label ('multiple') entries are left out.\n",
+    "selected_idx = []\n",
+    "num = 0\n",
+    "for c, size in counts.items():\n",
+    "    if c in ('', 'multiple') or size < 159:\n",
+    "        continue\n",
+    "    num += 1\n",
+    "    indices = [pos for pos, cat in enumerate(categories) if cat == c]\n",
+    "    selected_idx += indices\n",
+    "\n",
+    "print('Number of classes %d' % num)\n",
+    "print('Number of documents %d' % len(selected_idx))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_and_write(name, dataset, idx):\n",
+    "    \"\"\"Write ``dataset[i]`` for every ``i`` in ``idx`` to the file ``name``, one entry per line.\"\"\"\n",
+    "    with open(name, 'w') as out:\n",
+    "        out.writelines(dataset[i] + '\\n' for i in idx)\n",
+    "\n",
+    "\n",
+    "# Mono-label subset: the documents, then their first label field, in the same order.\n",
+    "extract_and_write('../data/afp_mono.txt', text, selected_idx)\n",
+    "extract_and_write('../data/afp_mono_label.txt', true1, selected_idx)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Subset 2: Only mono-label entries, balanced"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of classes 14\n",
+      "Number of documents 2226\n"
+     ]
+    }
+   ],
+   "source": [
+    "from random import Random\n",
+    "\n",
+    "# Dedicated, seeded generator so that re-running the notebook draws the same subset.\n",
+    "rng = Random(42)\n",
+    "\n",
+    "balanced_idx = []\n",
+    "num = 0\n",
+    "for c in counts:\n",
+    "    # Skip unlabeled and multi-label entries.\n",
+    "    if c == '' or c == 'multiple':\n",
+    "        continue\n",
+    "    # Skip classes too small to provide 159 distinct documents.\n",
+    "    if counts[c] < 159:\n",
+    "        continue\n",
+    "    num += 1\n",
+    "    indices = [i for i, x in enumerate(categories) if x == c]\n",
+    "    # sample() draws WITHOUT replacement; choices() draws with replacement and\n",
+    "    # could put the same document several times into the balanced subset.\n",
+    "    balanced_idx.extend(rng.sample(indices, k=159))\n",
+    "\n",
+    "print('Number of classes %d' % num)\n",
+    "print('Number of documents %d' % len(balanced_idx))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist the balanced subset: the documents, then the matching first label field.\n",
+    "extract_and_write(name='../data/afp_balanced.txt', dataset=text, idx=balanced_idx)\n",
+    "extract_and_write(name='../data/afp_balanced_label.txt', dataset=true1, idx=balanced_idx)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "metadata": {
+     "collapsed": false
+    },
+    "source": []
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/notebooks/out/lda_topics_afp_balanced.json b/notebooks/out/lda_topics_afp_balanced.json
@@ -0,0 +1 @@
+{"0": {"words": ["pope", "church", "francis", "visit", "catholic", "mass", "christian", "orthodox", "vatican", "religious"]}, "1": {"words": ["video", "picture", "live", "america", "africa", "north", "hold", "trial", "president", "graphic"]}, "2": {"words": ["royal", "japan", "king", "emperor", "palace", "ceremony", "throne", "thailand", "japanese", "thai"]}, "3": {"words": ["storm", "hurricane", "city", "rain", "wind", "river", "area", "orleans", "flooding", "hour"]}, "4": {"words": ["study", "scientist", "human", "woman", "university", "cancer", "found", "researcher", "percent", "team"]}, "5": {"words": ["win", "final", "cup", "game", "match", "team", "champion", "league", "season", "point"]}, "6": {"words": ["attack", "military", "force", "killed", "government", "group", "president", "security", "syria", "war"]}, "7": {"words": ["climate", "fire", "air", "change", "sea", "island", "water", "indonesia", "forest", "level"]}, "8": {"words": ["health", "case", "hospital", "ebola", "disease", "official", "outbreak", "child", "died", "killed"]}, "9": {"words": ["space", "moon", "mission", "edit", "earth", "nasa", "probe", "company", "business", "data"]}, "10": {"words": ["police", "woman", "court", "death", "migrant", "murder", "city", "victim", "found", "case"]}, "11": {"words": ["word", "woman", "moved", "party", "wrap", "politics", "election", "trump", "saudi", "president"]}, "12": {"words": ["percent", "china", "trade", "market", "chinese", "million", "kong", "economy", "deal", "hong"]}, "13": {"words": ["film", "work", "star", "show", "festival", "including", "life", "black", "music", "made"]}}
diff --git a/notebooks/out/lda_topics_afp_mono.json b/notebooks/out/lda_topics_afp_mono.json
@@ -0,0 +1 @@
+{"0": {"words": ["race", "team", "champion", "stage", "tour", "sec", "championship", "title", "season", "olympic"]}, "1": {"words": ["people", "police", "told", "city", "woman", "afp", "state", "family", "official", "hong"]}, "2": {"words": ["trump", "president", "state", "iran", "china", "country", "united", "north", "russia", "leader"]}, "3": {"words": ["company", "billion", "million", "percent", "sale", "market", "financial", "profit", "statement", "business"]}, "4": {"words": ["killed", "attack", "force", "people", "group", "syria", "military", "government", "state", "protest"]}, "5": {"words": ["business", "release", "www", "company", "contact", "businesswire", "content", "material", "patient", "technology"]}, "6": {"words": ["election", "party", "president", "vote", "minister", "government", "country", "trump", "leader", "political"]}, "7": {"words": ["england", "test", "australia", "wicket", "match", "run", "india", "cup", "ball", "pakistan"]}, "8": {"words": ["cup", "round", "group", "championship", "football", "open", "result", "par", "champion", "africa"]}, "9": {"words": ["cup", "team", "rugby", "game", "player", "coach", "england", "half", "match", "japan"]}, "10": {"words": ["open", "match", "set", "slam", "round", "title", "nadal", "champion", "seed", "federer"]}, "11": {"words": ["league", "game", "season", "goal", "point", "minute", "club", "champion", "team", "player"]}, "12": {"words": ["percent", "trade", "china", "market", "stock", "index", "point", "yen", "economy", "rate"]}, "13": {"words": ["video", "picture", "live", "america", "africa", "north", "graphic", "president", "hold", "trial"]}}
diff --git a/notebooks/out/lftm_topics_afp_balanced.json b/notebooks/out/lftm_topics_afp_balanced.json
@@ -0,0 +1 @@
+{"0": {"words": ["king", "queen", "emperor", "royal", "princess", "throne", "japanese", "great", "ceremony", "family"]}, "1": {"words": ["bur", "nrs", "sheeb", "jkb", "lanteaume", "pgf", "lto", "rbl", "jdg", "nin"]}, "2": {"words": ["space", "earth", "spacecraft", "orbit", "plane", "nasa", "moon", "research", "orbiting", "can"]}, "3": {"words": ["military", "attack", "army", "israel", "border", "government", "militant", "iraq", "killed", "saudi"]}, "4": {"words": ["disease", "hiv", "health", "cancer", "virus", "flu", "drug", "treatment", "hospital", "infection"]}, "5": {"words": ["percent", "market", "growth", "global", "billion", "economic", "investment", "china", "index", "price"]}, "6": {"words": ["said", "year", "people", "one", "country", "new", "last", "since", "also", "two"]}, "7": {"words": ["video", "picture", "live", "america", "africa", "north", "new", "trial", "graphic", "visit"]}, "8": {"words": ["water", "sea", "fish", "fossil", "animal", "human", "world", "natural", "earth", "ocean"]}, "9": {"words": ["summit", "president", "minister", "international", "meeting", "world", "european", "paris", "prime", "france"]}, "10": {"words": ["storm", "hurricane", "flooding", "city", "area", "police", "near", "coast", "injured", "people"]}, "11": {"words": ["church", "catholic", "pope", "visit", "orthodox", "people", "saint", "holy", "roman", "christian"]}, "12": {"words": ["woman", "police", "court", "child", "man", "sex", "murder", "party", "law", "case"]}, "13": {"words": ["win", "cup", "match", "game", "team", "final", "second", "league", "champion", "play"]}}
diff --git a/notebooks/out/lftm_topics_afp_mono.json b/notebooks/out/lftm_topics_afp_mono.json
@@ -0,0 +1 @@
+{"0": {"words": ["video", "picture", "live", "new", "america", "africa", "north", "graphic", "trial", "visit"]}, "1": {"words": ["killed", "people", "police", "said", "dead", "injured", "area", "reported", "around", "near"]}, "2": {"words": ["military", "israel", "iraq", "nuclear", "israeli", "government", "attack", "iran", "korea", "army"]}, "3": {"words": ["court", "police", "official", "chinese", "friday", "monday", "protest", "china", "thursday", "woman"]}, "4": {"words": ["england", "wicket", "cricket", "cup", "test", "match", "india", "australia", "batsman", "africa"]}, "5": {"words": ["cup", "france", "summit", "world", "international", "soccer", "paris", "european", "saturday", "club"]}, "6": {"words": ["champion", "win", "final", "second", "round", "tournament", "fourth", "world", "open", "title"]}, "7": {"words": ["def", "gbr", "ger", "swe", "easterby", "eng", "aprilia", "fra", "fdj", "ntags"]}, "8": {"words": ["game", "season", "team", "win", "big", "play", "coach", "player", "i", "star"]}, "9": {"words": ["percent", "market", "growth", "billion", "stock", "benchmark", "price", "earnings", "economic", "trade"]}, "10": {"words": ["win", "goal", "league", "game", "team", "season", "match", "striker", "cup", "second"]}, "11": {"words": ["said", "trump", "state", "president", "say", "city", "told", "china", "deal", "want"]}, "12": {"words": ["research", "information", "technology", "software", "company", "network", "global", "based", "data", "can"]}, "13": {"words": ["party", "election", "prime", "government", "president", "democratic", "opposition", "minister", "leader", "vote"]}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"0": {"words": ["pope", "church", "francis", "visit", "catholic", "mass", "christian", "orthodox", "vatican", "religious"]}, "1": {"words": ["video", "picture", "live", "america", "africa", "north", "hold", "trial", "president", "graphic"]}, "2": {"words": ["royal", "japan", "king", "emperor", "palace", "ceremony", "throne", "thailand", "japanese", "thai"]}, "3": {"words": ["storm", "hurricane", "city", "rain", "wind", "river", "area", "orleans", "flooding", "hour"]}, "4": {"words": ["study", "scientist", "human", "woman", "university", "cancer", "found", "researcher", "percent", "team"]}, "5": {"words": ["win", "final", "cup", "game", "match", "team", "champion", "league", "season", "point"]}, "6": {"words": ["attack", "military", "force", "killed", "government", "group", "president", "security", "syria", "war"]}, "7": {"words": ["climate", "fire", "air", "change", "sea", "island", "water", "indonesia", "forest", "level"]}, "8": {"words": ["health", "case", "hospital", "ebola", "disease", "official", "outbreak", "child", "died", "killed"]}, "9": {"words": ["space", "moon", "mission", "edit", "earth", "nasa", "probe", "company", "business", "data"]}, "10": {"words": ["police", "woman", "court", "death", "migrant", "murder", "city", "victim", "found", "case"]}, "11": {"words": ["word", "woman", "moved", "party", "wrap", "politics", "election", "trump", "saudi", "president"]}, "12": {"words": ["percent", "china", "trade", "market", "chinese", "million", "kong", "economy", "deal", "hong"]}, "13": {"words": ["film", "work", "star", "show", "festival", "including", "life", "black", "music", "made"]}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"0": {"words": ["race", "team", "champion", "stage", "tour", "sec", "championship", "title", "season", "olympic"]}, "1": {"words": ["people", "police", "told", "city", "woman", "afp", "state", "family", "official", "hong"]}, "2": {"words": ["trump", "president", "state", "iran", "china", "country", "united", "north", "russia", "leader"]}, "3": {"words": ["company", "billion", "million", "percent", "sale", "market", "financial", "profit", "statement", "business"]}, "4": {"words": ["killed", "attack", "force", "people", "group", "syria", "military", "government", "state", "protest"]}, "5": {"words": ["business", "release", "www", "company", "contact", "businesswire", "content", "material", "patient", "technology"]}, "6": {"words": ["election", "party", "president", "vote", "minister", "government", "country", "trump", "leader", "political"]}, "7": {"words": ["england", "test", "australia", "wicket", "match", "run", "india", "cup", "ball", "pakistan"]}, "8": {"words": ["cup", "round", "group", "championship", "football", "open", "result", "par", "champion", "africa"]}, "9": {"words": ["cup", "team", "rugby", "game", "player", "coach", "england", "half", "match", "japan"]}, "10": {"words": ["open", "match", "set", "slam", "round", "title", "nadal", "champion", "seed", "federer"]}, "11": {"words": ["league", "game", "season", "goal", "point", "minute", "club", "champion", "team", "player"]}, "12": {"words": ["percent", "trade", "china", "market", "stock", "index", "point", "yen", "economy", "rate"]}, "13": {"words": ["video", "picture", "live", "america", "africa", "north", "graphic", "president", "hold", "trial"]}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"0": {"words": ["king", "queen", "emperor", "royal", "princess", "throne", "japanese", "great", "ceremony", "family"]}, "1": {"words": ["bur", "nrs", "sheeb", "jkb", "lanteaume", "pgf", "lto", "rbl", "jdg", "nin"]}, "2": {"words": ["space", "earth", "spacecraft", "orbit", "plane", "nasa", "moon", "research", "orbiting", "can"]}, "3": {"words": ["military", "attack", "army", "israel", "border", "government", "militant", "iraq", "killed", "saudi"]}, "4": {"words": ["disease", "hiv", "health", "cancer", "virus", "flu", "drug", "treatment", "hospital", "infection"]}, "5": {"words": ["percent", "market", "growth", "global", "billion", "economic", "investment", "china", "index", "price"]}, "6": {"words": ["said", "year", "people", "one", "country", "new", "last", "since", "also", "two"]}, "7": {"words": ["video", "picture", "live", "america", "africa", "north", "new", "trial", "graphic", "visit"]}, "8": {"words": ["water", "sea", "fish", "fossil", "animal", "human", "world", "natural", "earth", "ocean"]}, "9": {"words": ["summit", "president", "minister", "international", "meeting", "world", "european", "paris", "prime", "france"]}, "10": {"words": ["storm", "hurricane", "flooding", "city", "area", "police", "near", "coast", "injured", "people"]}, "11": {"words": ["church", "catholic", "pope", "visit", "orthodox", "people", "saint", "holy", "roman", "christian"]}, "12": {"words": ["woman", "police", "court", "child", "man", "sex", "murder", "party", "law", "case"]}, "13": {"words": ["win", "cup", "match", "game", "team", "final", "second", "league", "champion", "play"]}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"0": {"words": ["video", "picture", "live", "new", "america", "africa", "north", "graphic", "trial", "visit"]}, "1": {"words": ["killed", "people", "police", "said", "dead", "injured", "area", "reported", "around", "near"]}, "2": {"words": ["military", "israel", "iraq", "nuclear", "israeli", "government", "attack", "iran", "korea", "army"]}, "3": {"words": ["court", "police", "official", "chinese", "friday", "monday", "protest", "china", "thursday", "woman"]}, "4": {"words": ["england", "wicket", "cricket", "cup", "test", "match", "india", "australia", "batsman", "africa"]}, "5": {"words": ["cup", "france", "summit", "world", "international", "soccer", "paris", "european", "saturday", "club"]}, "6": {"words": ["champion", "win", "final", "second", "round", "tournament", "fourth", "world", "open", "title"]}, "7": {"words": ["def", "gbr", "ger", "swe", "easterby", "eng", "aprilia", "fra", "fdj", "ntags"]}, "8": {"words": ["game", "season", "team", "win", "big", "play", "coach", "player", "i", "star"]}, "9": {"words": ["percent", "market", "growth", "billion", "stock", "benchmark", "price", "earnings", "economic", "trade"]}, "10": {"words": ["win", "goal", "league", "game", "team", "season", "match", "striker", "cup", "second"]}, "11": {"words": ["said", "trump", "state", "president", "say", "city", "told", "china", "deal", "want"]}, "12": {"words": ["research", "information", "technology", "software", "company", "network", "global", "based", "data", "can"]}, "13": {"words": ["party", "election", "prime", "government", "president", "democratic", "opposition", "minister", "leader", "vote"]}}