Skip to content

Commit

Permalink
prediction afp mono and balanced
Browse files Browse the repository at this point in the history
  • Loading branch information
pasqLisena committed May 15, 2020
1 parent 28b0ecf commit c8881c9
Show file tree
Hide file tree
Showing 8 changed files with 72,332 additions and 4 deletions.
268 changes: 268 additions & 0 deletions notebooks/afp_balancing.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AFP balanced dataset\n",
"I want to extract from this dataset a balanced subset with respect to the categories in the ground truth."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"dataset = '../data/afp.txt'\n",
"labels = '../data/afp_labels.txt'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[',,',\n",
" '13000000,13007000 13006000 13008000,',\n",
" '11000000,11014000 11001000 11002000 11006000,',\n",
" '11000000,11014000 11001000 11002000 11006000,',\n",
" '11000000,11014000 11001000 11002000 11006000,',\n",
" '11000000,11014000 11001000 11002000 11006000,',\n",
" '15000000,,',\n",
" '11000000,11014000 11001000 11002000 11006000,',\n",
" '02000000,02003000 02001000 16001000,02003001',\n",
" '15000000,15054000,']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One label record per line, with trailing whitespace stripped. Every line is\n",
"# kept (iterating a file never yields an empty string), so list positions\n",
"# match line numbers in the labels file.\n",
"with open(labels, \"r\") as label_file:\n",
"    true_topics = list(map(str.rstrip, label_file))\n",
"true_topics[:10]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('',\n",
" '13000000',\n",
" '11000000',\n",
" '11000000',\n",
" '11000000',\n",
" '11000000',\n",
" '15000000',\n",
" '11000000',\n",
" '02000000',\n",
" '15000000')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each record is expected to hold three comma-separated fields; unzip the\n",
"# records into three parallel tuples, one per field.\n",
"fields_per_record = (record.split(',') for record in true_topics)\n",
"true1, true2, true3 = zip(*fields_per_record)\n",
"true1[:10]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({'': 3542,\n",
" '13000000': 419,\n",
" '11000000': 12703,\n",
" '15000000': 34025,\n",
" '02000000': 2161,\n",
" 'multiple': 52838,\n",
" '03000000': 1606,\n",
" '10000000': 61,\n",
" '08000000': 260,\n",
" '16000000': 5462,\n",
" '04000000': 9778,\n",
" '12000000': 196,\n",
" '01000000': 1509,\n",
" '06000000': 269,\n",
" '05000000': 13,\n",
" '17000000': 172,\n",
" '07000000': 335,\n",
" '09000000': 8,\n",
" '14000000': 159})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import Counter\n",
"\n",
"# A first field holding several space-separated codes is collapsed into the\n",
"# single pseudo-class 'multiple'; everything else is kept verbatim.\n",
"categories = ['multiple' if ' ' in code else code for code in true1]\n",
"counts = Counter(categories)\n",
"counts"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Load the documents, one per line, with trailing whitespace stripped.\n",
"with open(dataset, \"r\") as text_file:\n",
"    text = list(map(str.rstrip, text_file))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Subset 1: Only mono-label entries"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of classes 14\n",
"Number of documents 69054\n"
]
}
],
"source": [
"# Keep every single-label class with at least `min_docs` documents;\n",
"# unlabelled ('') and multi-label ('multiple') entries are left out.\n",
"min_docs = 159\n",
"kept_classes = [c for c in counts\n",
"                if c not in ('', 'multiple') and counts[c] >= min_docs]\n",
"num = len(kept_classes)\n",
"\n",
"# Indices are gathered class by class, in the order the classes appear in `counts`.\n",
"selected_idx = [i for c in kept_classes\n",
"                for i, x in enumerate(categories) if x == c]\n",
"\n",
"print('Number of classes %d' % num)\n",
"print('Number of documents %d' % len(selected_idx))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def extract_and_write(name, dataset, idx):\n",
"    \"\"\"Write `dataset[i]` for every `i` in `idx` to the file `name`, one entry per line.\"\"\"\n",
"    with open(name, 'w') as out:\n",
"        out.writelines(dataset[i] + '\\n' for i in idx)\n",
"\n",
"\n",
"extract_and_write('../data/afp_mono.txt', text, selected_idx)\n",
"extract_and_write('../data/afp_mono_label.txt', true1, selected_idx)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Subset 2: Only mono-label entries, balanced"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of classes 14\n",
"Number of documents 2226\n"
]
}
],
"source": [
"from random import Random\n",
"\n",
"# Draw WITHOUT replacement (`sample`, not `choices`) so that no document is\n",
"# duplicated in the balanced subset; a fixed seed makes the draw reproducible\n",
"# under Restart & Run All.\n",
"rng = Random(42)\n",
"per_class = 159  # size of the smallest class that is kept\n",
"\n",
"balanced_idx = []\n",
"num = 0\n",
"for c in counts:\n",
"    # Skip unlabelled and multi-label entries.\n",
"    if c == '' or c == 'multiple':\n",
"        continue\n",
"    # Skip classes too small to contribute `per_class` distinct documents.\n",
"    if counts[c] < per_class:\n",
"        continue\n",
"    num += 1\n",
"    indices = [i for i, x in enumerate(categories) if x == c]\n",
"    balanced_idx.extend(rng.sample(indices, k=per_class))\n",
"\n",
"print('Number of classes %d' % num)\n",
"print('Number of documents %d' % len(balanced_idx))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Export the balanced subset: documents and their first-field labels, in the same order.\n",
"for path, rows in (('../data/afp_balanced.txt', text),\n",
"                   ('../data/afp_balanced_label.txt', true1)):\n",
"    extract_and_write(path, rows, balanced_idx)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}
1 change: 1 addition & 0 deletions notebooks/out/lda_topics_afp_balanced.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"0": {"words": ["pope", "church", "francis", "visit", "catholic", "mass", "christian", "orthodox", "vatican", "religious"]}, "1": {"words": ["video", "picture", "live", "america", "africa", "north", "hold", "trial", "president", "graphic"]}, "2": {"words": ["royal", "japan", "king", "emperor", "palace", "ceremony", "throne", "thailand", "japanese", "thai"]}, "3": {"words": ["storm", "hurricane", "city", "rain", "wind", "river", "area", "orleans", "flooding", "hour"]}, "4": {"words": ["study", "scientist", "human", "woman", "university", "cancer", "found", "researcher", "percent", "team"]}, "5": {"words": ["win", "final", "cup", "game", "match", "team", "champion", "league", "season", "point"]}, "6": {"words": ["attack", "military", "force", "killed", "government", "group", "president", "security", "syria", "war"]}, "7": {"words": ["climate", "fire", "air", "change", "sea", "island", "water", "indonesia", "forest", "level"]}, "8": {"words": ["health", "case", "hospital", "ebola", "disease", "official", "outbreak", "child", "died", "killed"]}, "9": {"words": ["space", "moon", "mission", "edit", "earth", "nasa", "probe", "company", "business", "data"]}, "10": {"words": ["police", "woman", "court", "death", "migrant", "murder", "city", "victim", "found", "case"]}, "11": {"words": ["word", "woman", "moved", "party", "wrap", "politics", "election", "trump", "saudi", "president"]}, "12": {"words": ["percent", "china", "trade", "market", "chinese", "million", "kong", "economy", "deal", "hong"]}, "13": {"words": ["film", "work", "star", "show", "festival", "including", "life", "black", "music", "made"]}}
1 change: 1 addition & 0 deletions notebooks/out/lda_topics_afp_mono.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"0": {"words": ["race", "team", "champion", "stage", "tour", "sec", "championship", "title", "season", "olympic"]}, "1": {"words": ["people", "police", "told", "city", "woman", "afp", "state", "family", "official", "hong"]}, "2": {"words": ["trump", "president", "state", "iran", "china", "country", "united", "north", "russia", "leader"]}, "3": {"words": ["company", "billion", "million", "percent", "sale", "market", "financial", "profit", "statement", "business"]}, "4": {"words": ["killed", "attack", "force", "people", "group", "syria", "military", "government", "state", "protest"]}, "5": {"words": ["business", "release", "www", "company", "contact", "businesswire", "content", "material", "patient", "technology"]}, "6": {"words": ["election", "party", "president", "vote", "minister", "government", "country", "trump", "leader", "political"]}, "7": {"words": ["england", "test", "australia", "wicket", "match", "run", "india", "cup", "ball", "pakistan"]}, "8": {"words": ["cup", "round", "group", "championship", "football", "open", "result", "par", "champion", "africa"]}, "9": {"words": ["cup", "team", "rugby", "game", "player", "coach", "england", "half", "match", "japan"]}, "10": {"words": ["open", "match", "set", "slam", "round", "title", "nadal", "champion", "seed", "federer"]}, "11": {"words": ["league", "game", "season", "goal", "point", "minute", "club", "champion", "team", "player"]}, "12": {"words": ["percent", "trade", "china", "market", "stock", "index", "point", "yen", "economy", "rate"]}, "13": {"words": ["video", "picture", "live", "america", "africa", "north", "graphic", "president", "hold", "trial"]}}
1 change: 1 addition & 0 deletions notebooks/out/lftm_topics_afp_balanced.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"0": {"words": ["king", "queen", "emperor", "royal", "princess", "throne", "japanese", "great", "ceremony", "family"]}, "1": {"words": ["bur", "nrs", "sheeb", "jkb", "lanteaume", "pgf", "lto", "rbl", "jdg", "nin"]}, "2": {"words": ["space", "earth", "spacecraft", "orbit", "plane", "nasa", "moon", "research", "orbiting", "can"]}, "3": {"words": ["military", "attack", "army", "israel", "border", "government", "militant", "iraq", "killed", "saudi"]}, "4": {"words": ["disease", "hiv", "health", "cancer", "virus", "flu", "drug", "treatment", "hospital", "infection"]}, "5": {"words": ["percent", "market", "growth", "global", "billion", "economic", "investment", "china", "index", "price"]}, "6": {"words": ["said", "year", "people", "one", "country", "new", "last", "since", "also", "two"]}, "7": {"words": ["video", "picture", "live", "america", "africa", "north", "new", "trial", "graphic", "visit"]}, "8": {"words": ["water", "sea", "fish", "fossil", "animal", "human", "world", "natural", "earth", "ocean"]}, "9": {"words": ["summit", "president", "minister", "international", "meeting", "world", "european", "paris", "prime", "france"]}, "10": {"words": ["storm", "hurricane", "flooding", "city", "area", "police", "near", "coast", "injured", "people"]}, "11": {"words": ["church", "catholic", "pope", "visit", "orthodox", "people", "saint", "holy", "roman", "christian"]}, "12": {"words": ["woman", "police", "court", "child", "man", "sex", "murder", "party", "law", "case"]}, "13": {"words": ["win", "cup", "match", "game", "team", "final", "second", "league", "champion", "play"]}}
1 change: 1 addition & 0 deletions notebooks/out/lftm_topics_afp_mono.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"0": {"words": ["video", "picture", "live", "new", "america", "africa", "north", "graphic", "trial", "visit"]}, "1": {"words": ["killed", "people", "police", "said", "dead", "injured", "area", "reported", "around", "near"]}, "2": {"words": ["military", "israel", "iraq", "nuclear", "israeli", "government", "attack", "iran", "korea", "army"]}, "3": {"words": ["court", "police", "official", "chinese", "friday", "monday", "protest", "china", "thursday", "woman"]}, "4": {"words": ["england", "wicket", "cricket", "cup", "test", "match", "india", "australia", "batsman", "africa"]}, "5": {"words": ["cup", "france", "summit", "world", "international", "soccer", "paris", "european", "saturday", "club"]}, "6": {"words": ["champion", "win", "final", "second", "round", "tournament", "fourth", "world", "open", "title"]}, "7": {"words": ["def", "gbr", "ger", "swe", "easterby", "eng", "aprilia", "fra", "fdj", "ntags"]}, "8": {"words": ["game", "season", "team", "win", "big", "play", "coach", "player", "i", "star"]}, "9": {"words": ["percent", "market", "growth", "billion", "stock", "benchmark", "price", "earnings", "economic", "trade"]}, "10": {"words": ["win", "goal", "league", "game", "team", "season", "match", "striker", "cup", "second"]}, "11": {"words": ["said", "trump", "state", "president", "say", "city", "told", "china", "deal", "want"]}, "12": {"words": ["research", "information", "technology", "software", "company", "network", "global", "based", "data", "can"]}, "13": {"words": ["party", "election", "prime", "government", "president", "democratic", "opposition", "minister", "leader", "vote"]}}
Loading

0 comments on commit c8881c9

Please sign in to comment.