Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
angelovagalina authored Oct 12, 2021
1 parent 8741063 commit 229c23c
Show file tree
Hide file tree
Showing 6 changed files with 67,716 additions and 0 deletions.
144 changes: 144 additions & 0 deletions notebooks/create-train-set.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#imports \n",
"import pandas as pd \n",
"import re \n",
"from pathlib import Path \n",
"import csv \n",
"\n",
"from tqdm import tqdm "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define in/out files"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"#### Set sizes \n",
" - training data: 54325 [0:54325] \n",
" - test data: 5113 [54325:59438] \n",
" - dev data: 4470 [59438:63908] \n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate training data "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def generate_train(in_csv, out_txt, n=54325):\n",
"    \"\"\"Write the first `n` values of column 0 of `in_csv` to `out_txt`, one per line.\n",
"\n",
"    in_csv  -- path of the CSV file; it is read without a header row\n",
"    out_txt -- path of the UTF-8 text file to write (opened in 'w' mode)\n",
"    n       -- number of leading rows that make up the training set\n",
"    \"\"\"\n",
"    df = pd.read_csv(in_csv, header=None)\n",
"    # Slice by `n` rather than a fixed count so the requested training-set\n",
"    # size (54325 rows by default) is what actually gets written.\n",
"    out_lines = df[0].tolist()[:n]\n",
"\n",
"    with open(out_txt, 'w', encoding='utf-8') as f:\n",
"        for line in tqdm(out_lines):\n",
"            # Non-string cells (e.g. float NaN for an empty field) cannot be\n",
"            # passed to f.write directly, so they are stringified first.\n",
"            f.write(line if isinstance(line, str) else str(line))\n",
"            f.write('\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate test data "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def generate_test(in_csv, out_txt, n=5113, start=54325):\n",
"    \"\"\"Write `n` values of column 0 of `in_csv`, beginning at row `start`, to `out_txt`.\n",
"\n",
"    in_csv  -- path of the CSV file; it is read without a header row\n",
"    out_txt -- path of the UTF-8 text file to write (opened in 'w' mode)\n",
"    n       -- number of rows in the test set\n",
"    start   -- index of the first test row; the defaults select [54325:59438]\n",
"    \"\"\"\n",
"    df = pd.read_csv(in_csv, header=None)\n",
"    # Derive the slice from the arguments so that `n` is honoured instead of\n",
"    # being silently ignored in favour of fixed bounds.\n",
"    out_lines = df[0].tolist()[start:start + n]\n",
"\n",
"    with open(out_txt, 'w', encoding='utf-8') as f:\n",
"        for line in tqdm(out_lines):\n",
"            # Non-string cells (e.g. float NaN for an empty field) cannot be\n",
"            # passed to f.write directly, so they are stringified first.\n",
"            f.write(line if isinstance(line, str) else str(line))\n",
"            f.write('\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate dev data "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def generate_dev(in_csv, out_txt, n=4470, start=59438):\n",
"    \"\"\"Write `n` values of column 0 of `in_csv`, beginning at row `start`, to `out_txt`.\n",
"\n",
"    in_csv  -- path of the CSV file; it is read without a header row\n",
"    out_txt -- path of the UTF-8 text file to write (opened in 'w' mode)\n",
"    n       -- number of rows in the dev set\n",
"    start   -- index of the first dev row; the defaults select [59438:63908]\n",
"    \"\"\"\n",
"    df = pd.read_csv(in_csv, header=None)\n",
"    # Derive the slice from the arguments so that `n` is honoured instead of\n",
"    # being silently ignored in favour of fixed bounds.\n",
"    out_lines = df[0].tolist()[start:start + n]\n",
"\n",
"    with open(out_txt, 'w', encoding='utf-8') as f:\n",
"        for line in tqdm(out_lines):\n",
"            # Non-string cells (e.g. float NaN for an empty field) cannot be\n",
"            # passed to f.write directly, so they are stringified first.\n",
"            f.write(line if isinstance(line, str) else str(line))\n",
"            f.write('\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 229c23c

Please sign in to comment.