-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
28b0ecf
commit c8881c9
Showing
8 changed files
with
72,332 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# AFP balanced dataset\n", | ||
"I want to extract from this dataset a balanced subset with respect to the categories in the ground truth." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dataset = '../data/afp.txt'\n", | ||
"labels = '../data/afp_labels.txt'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[',,',\n", | ||
" '13000000,13007000 13006000 13008000,',\n", | ||
" '11000000,11014000 11001000 11002000 11006000,',\n", | ||
" '11000000,11014000 11001000 11002000 11006000,',\n", | ||
" '11000000,11014000 11001000 11002000 11006000,',\n", | ||
" '11000000,11014000 11001000 11002000 11006000,',\n", | ||
" '15000000,,',\n", | ||
" '11000000,11014000 11001000 11002000 11006000,',\n", | ||
" '02000000,02003000 02001000 16001000,02003001',\n", | ||
" '15000000,15054000,']" | ||
] | ||
}, | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"with open(labels, \"r\") as datafile:\n", | ||
" true_topics = [line.rstrip() for line in datafile if line]\n", | ||
"true_topics[0:10]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"('',\n", | ||
" '13000000',\n", | ||
" '11000000',\n", | ||
" '11000000',\n", | ||
" '11000000',\n", | ||
" '11000000',\n", | ||
" '15000000',\n", | ||
" '11000000',\n", | ||
" '02000000',\n", | ||
" '15000000')" | ||
] | ||
}, | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"true1, true2, true3 = zip(*[line.split(',') for line in true_topics])\n", | ||
"true1[0:10]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"Counter({'': 3542,\n", | ||
" '13000000': 419,\n", | ||
" '11000000': 12703,\n", | ||
" '15000000': 34025,\n", | ||
" '02000000': 2161,\n", | ||
" 'multiple': 52838,\n", | ||
" '03000000': 1606,\n", | ||
" '10000000': 61,\n", | ||
" '08000000': 260,\n", | ||
" '16000000': 5462,\n", | ||
" '04000000': 9778,\n", | ||
" '12000000': 196,\n", | ||
" '01000000': 1509,\n", | ||
" '06000000': 269,\n", | ||
" '05000000': 13,\n", | ||
" '17000000': 172,\n", | ||
" '07000000': 335,\n", | ||
" '09000000': 8,\n", | ||
" '14000000': 159})" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"categories = [x if ' ' not in x else 'multiple' for x in true1]\n", | ||
"\n", | ||
"from collections import Counter\n", | ||
"counts = Counter(categories)\n", | ||
"counts" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"with open(dataset, \"r\") as datafile:\n", | ||
" text = [line.rstrip() for line in datafile if line]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Subset 1: Only mono-label entries" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Number of classes 14\n", | ||
"Number of documents 69054\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"selected_idx = []\n", | ||
"num = 0\n", | ||
"for c in counts:\n", | ||
" if c == '' or c == 'multiple':\n", | ||
" continue\n", | ||
" if counts[c] < 159:\n", | ||
" continue\n", | ||
" num += 1\n", | ||
" indices = [i for i, x in enumerate(categories) if x == c]\n", | ||
" selected_idx.extend(indices)\n", | ||
"\n", | ||
"\n", | ||
"print('Number of classes %d' % num)\n", | ||
"print('Number of documents %d' % len(selected_idx))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def extract_and_write(name, dataset, idx):\n", | ||
" with open(name, 'w') as f:\n", | ||
" for i in idx:\n", | ||
" f.write(dataset[i])\n", | ||
" f.write('\\n')\n", | ||
" \n", | ||
"extract_and_write('../data/afp_mono.txt', text, selected_idx)\n", | ||
"extract_and_write('../data/afp_mono_label.txt', true1, selected_idx)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Subset 2: Only mono-label entries, balanced" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Number of classes 14\n", | ||
"Number of documents 2226\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from random import choices\n", | ||
"\n", | ||
"balanced_idx = []\n", | ||
"num = 0\n", | ||
"for c in counts:\n", | ||
" if c == '' or c == 'multiple':\n", | ||
" continue\n", | ||
" if counts[c] < 159:\n", | ||
" continue\n", | ||
" num += 1\n", | ||
" indices = [i for i, x in enumerate(categories) if x == c]\n", | ||
" balanced_idx.extend(choices(indices, k=159))\n", | ||
"\n", | ||
"print('Number of classes %d' % num)\n", | ||
"print('Number of documents %d' % len(balanced_idx))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"extract_and_write('../data/afp_balanced.txt', text, balanced_idx)\n", | ||
"extract_and_write('../data/afp_balanced_label.txt', true1, balanced_idx)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.7" | ||
}, | ||
"pycharm": { | ||
"stem_cell": { | ||
"cell_type": "raw", | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"source": [] | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"0": {"words": ["pope", "church", "francis", "visit", "catholic", "mass", "christian", "orthodox", "vatican", "religious"]}, "1": {"words": ["video", "picture", "live", "america", "africa", "north", "hold", "trial", "president", "graphic"]}, "2": {"words": ["royal", "japan", "king", "emperor", "palace", "ceremony", "throne", "thailand", "japanese", "thai"]}, "3": {"words": ["storm", "hurricane", "city", "rain", "wind", "river", "area", "orleans", "flooding", "hour"]}, "4": {"words": ["study", "scientist", "human", "woman", "university", "cancer", "found", "researcher", "percent", "team"]}, "5": {"words": ["win", "final", "cup", "game", "match", "team", "champion", "league", "season", "point"]}, "6": {"words": ["attack", "military", "force", "killed", "government", "group", "president", "security", "syria", "war"]}, "7": {"words": ["climate", "fire", "air", "change", "sea", "island", "water", "indonesia", "forest", "level"]}, "8": {"words": ["health", "case", "hospital", "ebola", "disease", "official", "outbreak", "child", "died", "killed"]}, "9": {"words": ["space", "moon", "mission", "edit", "earth", "nasa", "probe", "company", "business", "data"]}, "10": {"words": ["police", "woman", "court", "death", "migrant", "murder", "city", "victim", "found", "case"]}, "11": {"words": ["word", "woman", "moved", "party", "wrap", "politics", "election", "trump", "saudi", "president"]}, "12": {"words": ["percent", "china", "trade", "market", "chinese", "million", "kong", "economy", "deal", "hong"]}, "13": {"words": ["film", "work", "star", "show", "festival", "including", "life", "black", "music", "made"]}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"0": {"words": ["race", "team", "champion", "stage", "tour", "sec", "championship", "title", "season", "olympic"]}, "1": {"words": ["people", "police", "told", "city", "woman", "afp", "state", "family", "official", "hong"]}, "2": {"words": ["trump", "president", "state", "iran", "china", "country", "united", "north", "russia", "leader"]}, "3": {"words": ["company", "billion", "million", "percent", "sale", "market", "financial", "profit", "statement", "business"]}, "4": {"words": ["killed", "attack", "force", "people", "group", "syria", "military", "government", "state", "protest"]}, "5": {"words": ["business", "release", "www", "company", "contact", "businesswire", "content", "material", "patient", "technology"]}, "6": {"words": ["election", "party", "president", "vote", "minister", "government", "country", "trump", "leader", "political"]}, "7": {"words": ["england", "test", "australia", "wicket", "match", "run", "india", "cup", "ball", "pakistan"]}, "8": {"words": ["cup", "round", "group", "championship", "football", "open", "result", "par", "champion", "africa"]}, "9": {"words": ["cup", "team", "rugby", "game", "player", "coach", "england", "half", "match", "japan"]}, "10": {"words": ["open", "match", "set", "slam", "round", "title", "nadal", "champion", "seed", "federer"]}, "11": {"words": ["league", "game", "season", "goal", "point", "minute", "club", "champion", "team", "player"]}, "12": {"words": ["percent", "trade", "china", "market", "stock", "index", "point", "yen", "economy", "rate"]}, "13": {"words": ["video", "picture", "live", "america", "africa", "north", "graphic", "president", "hold", "trial"]}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"0": {"words": ["king", "queen", "emperor", "royal", "princess", "throne", "japanese", "great", "ceremony", "family"]}, "1": {"words": ["bur", "nrs", "sheeb", "jkb", "lanteaume", "pgf", "lto", "rbl", "jdg", "nin"]}, "2": {"words": ["space", "earth", "spacecraft", "orbit", "plane", "nasa", "moon", "research", "orbiting", "can"]}, "3": {"words": ["military", "attack", "army", "israel", "border", "government", "militant", "iraq", "killed", "saudi"]}, "4": {"words": ["disease", "hiv", "health", "cancer", "virus", "flu", "drug", "treatment", "hospital", "infection"]}, "5": {"words": ["percent", "market", "growth", "global", "billion", "economic", "investment", "china", "index", "price"]}, "6": {"words": ["said", "year", "people", "one", "country", "new", "last", "since", "also", "two"]}, "7": {"words": ["video", "picture", "live", "america", "africa", "north", "new", "trial", "graphic", "visit"]}, "8": {"words": ["water", "sea", "fish", "fossil", "animal", "human", "world", "natural", "earth", "ocean"]}, "9": {"words": ["summit", "president", "minister", "international", "meeting", "world", "european", "paris", "prime", "france"]}, "10": {"words": ["storm", "hurricane", "flooding", "city", "area", "police", "near", "coast", "injured", "people"]}, "11": {"words": ["church", "catholic", "pope", "visit", "orthodox", "people", "saint", "holy", "roman", "christian"]}, "12": {"words": ["woman", "police", "court", "child", "man", "sex", "murder", "party", "law", "case"]}, "13": {"words": ["win", "cup", "match", "game", "team", "final", "second", "league", "champion", "play"]}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"0": {"words": ["video", "picture", "live", "new", "america", "africa", "north", "graphic", "trial", "visit"]}, "1": {"words": ["killed", "people", "police", "said", "dead", "injured", "area", "reported", "around", "near"]}, "2": {"words": ["military", "israel", "iraq", "nuclear", "israeli", "government", "attack", "iran", "korea", "army"]}, "3": {"words": ["court", "police", "official", "chinese", "friday", "monday", "protest", "china", "thursday", "woman"]}, "4": {"words": ["england", "wicket", "cricket", "cup", "test", "match", "india", "australia", "batsman", "africa"]}, "5": {"words": ["cup", "france", "summit", "world", "international", "soccer", "paris", "european", "saturday", "club"]}, "6": {"words": ["champion", "win", "final", "second", "round", "tournament", "fourth", "world", "open", "title"]}, "7": {"words": ["def", "gbr", "ger", "swe", "easterby", "eng", "aprilia", "fra", "fdj", "ntags"]}, "8": {"words": ["game", "season", "team", "win", "big", "play", "coach", "player", "i", "star"]}, "9": {"words": ["percent", "market", "growth", "billion", "stock", "benchmark", "price", "earnings", "economic", "trade"]}, "10": {"words": ["win", "goal", "league", "game", "team", "season", "match", "striker", "cup", "second"]}, "11": {"words": ["said", "trump", "state", "president", "say", "city", "told", "china", "deal", "want"]}, "12": {"words": ["research", "information", "technology", "software", "company", "network", "global", "based", "data", "can"]}, "13": {"words": ["party", "election", "prime", "government", "president", "democratic", "opposition", "minister", "leader", "vote"]}} |
Oops, something went wrong.