Skip to content

Commit

Permalink
Modified thesaurus.py to fix singularization
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewkumar committed May 3, 2020
1 parent 1136e71 commit f6c18a0
Show file tree
Hide file tree
Showing 22 changed files with 6,034 additions and 6,816 deletions.
364 changes: 364 additions & 0 deletions .ipynb_checkpoints/Continuity-Margins-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,364 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from digital_manuscript import BnF\n",
"# Build the manuscript object; the cells below read their data from manuscript.entries.\n",
"manuscript = BnF()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"# Export every marginal note of every entry/version to margins.csv.\n",
"# The records are gathered first and the frame is built in one call: enlarging\n",
"# a frame row by row with df.loc[i] = ... gets slow as the row count grows.\n",
"margin_columns = ['entry_id', 'version', 'position', 'render', 'length']\n",
"margin_rows = [\n",
"    [identity, version, margin.position, margin.render, margin.length]\n",
"    for identity, entry in manuscript.entries.items()\n",
"    # Entries without margins contribute nothing here, so no explicit\n",
"    # emptiness check is needed.\n",
"    for version, margin_list in entry.margins.items()\n",
"    for margin in margin_list\n",
"]\n",
"df = pd.DataFrame(margin_rows, columns=margin_columns)\n",
"df.to_csv('margins.csv', index=False)\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def find_percent_marginal(entry, version):\n",
"    \"\"\"Return the percentage of an entry's length that sits in its margins.\n",
"\n",
"    Sums ``margin.length`` over the margins stored for ``version`` and\n",
"    divides by ``entry.length[version]``.\n",
"\n",
"    Returns 0.0 when the entry has no margins at all, or none stored under\n",
"    ``version``.\n",
"    \"\"\"\n",
"    if len(entry.margins) == 0:\n",
"        return 0.0\n",
"\n",
"    # Look the version up defensively: the guard above only proves that *some*\n",
"    # version has margins, so indexing entry.margins[version] directly would\n",
"    # raise KeyError if this particular version has none.\n",
"    version_margins = entry.margins.get(version, ())\n",
"    margin_sum = sum(margin.length for margin in version_margins)\n",
"\n",
"    return 100 * margin_sum / entry.length[version]\n",
"\n",
"def find_percent_continued(entry, version):\n",
"    \"\"\"Return the percentage of an entry's length found in continued divs.\n",
"\n",
"    The XML text of ``version`` is whitespace-normalised, every ``<div>``\n",
"    whose attributes contain ``continues=\"yes\"`` is stripped of inner tags,\n",
"    and the remaining character counts are summed and divided by\n",
"    ``entry.length[version]``.\n",
"    \"\"\"\n",
"    flattened = re.sub(r'\\s+', ' ', entry.text(version, True))\n",
"    div_pattern = r'(<div([\\w\\s=\";-]*)>(.*?)</div>)'\n",
"\n",
"    # group(2) holds the div's attribute string, group(3) its inner markup.\n",
"    continued_chars = sum(\n",
"        len(re.sub(r'<.*?>', '', match.group(3)))\n",
"        for match in re.finditer(div_pattern, flattened)\n",
"        if 'continues=\"yes\"' in match.group(2)\n",
"    )\n",
"    return 100 * continued_chars / entry.length[version]\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>entry_id</th>\n",
" <th>length_tc</th>\n",
" <th>length_tcn</th>\n",
" <th>length_tl</th>\n",
" <th>percent_marginal_tc</th>\n",
" <th>percent_marginal_tcn</th>\n",
" <th>percent_marginal_tl</th>\n",
" <th>percent_continued_tc</th>\n",
" <th>percent_continued_tcn</th>\n",
" <th>percent_continued_tl</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>001r_1</td>\n",
" <td>292</td>\n",
" <td>302</td>\n",
" <td>299</td>\n",
" <td>95.205479</td>\n",
" <td>95.033113</td>\n",
" <td>94.983278</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>001r_2</td>\n",
" <td>133</td>\n",
" <td>140</td>\n",
" <td>131</td>\n",
" <td>96.240602</td>\n",
" <td>95.000000</td>\n",
" <td>96.183206</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>001r_3</td>\n",
" <td>1502</td>\n",
" <td>1513</td>\n",
" <td>1511</td>\n",
" <td>97.936085</td>\n",
" <td>97.951091</td>\n",
" <td>98.146923</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>001r_4</td>\n",
" <td>94</td>\n",
" <td>98</td>\n",
" <td>96</td>\n",
" <td>81.914894</td>\n",
" <td>82.653061</td>\n",
" <td>84.375000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>001v_1</td>\n",
" <td>392</td>\n",
" <td>406</td>\n",
" <td>372</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>924</th>\n",
" <td>170r_6</td>\n",
" <td>631</td>\n",
" <td>647</td>\n",
" <td>580</td>\n",
" <td>35.340729</td>\n",
" <td>35.703246</td>\n",
" <td>36.724138</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>925</th>\n",
" <td>170v_1</td>\n",
" <td>498</td>\n",
" <td>507</td>\n",
" <td>523</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>926</th>\n",
" <td>170v_2</td>\n",
" <td>269</td>\n",
" <td>278</td>\n",
" <td>282</td>\n",
" <td>16.728625</td>\n",
" <td>15.827338</td>\n",
" <td>15.957447</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927</th>\n",
" <td>170v_3</td>\n",
" <td>497</td>\n",
" <td>510</td>\n",
" <td>536</td>\n",
" <td>12.273642</td>\n",
" <td>11.764706</td>\n",
" <td>11.567164</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>928</th>\n",
" <td>170v_4</td>\n",
" <td>286</td>\n",
" <td>296</td>\n",
" <td>292</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>929 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" entry_id length_tc length_tcn length_tl percent_marginal_tc \\\n",
"0 001r_1 292 302 299 95.205479 \n",
"1 001r_2 133 140 131 96.240602 \n",
"2 001r_3 1502 1513 1511 97.936085 \n",
"3 001r_4 94 98 96 81.914894 \n",
"4 001v_1 392 406 372 0.000000 \n",
".. ... ... ... ... ... \n",
"924 170r_6 631 647 580 35.340729 \n",
"925 170v_1 498 507 523 0.000000 \n",
"926 170v_2 269 278 282 16.728625 \n",
"927 170v_3 497 510 536 12.273642 \n",
"928 170v_4 286 296 292 0.000000 \n",
"\n",
" percent_marginal_tcn percent_marginal_tl percent_continued_tc \\\n",
"0 95.033113 94.983278 0.0 \n",
"1 95.000000 96.183206 0.0 \n",
"2 97.951091 98.146923 0.0 \n",
"3 82.653061 84.375000 0.0 \n",
"4 0.000000 0.000000 0.0 \n",
".. ... ... ... \n",
"924 35.703246 36.724138 0.0 \n",
"925 0.000000 0.000000 0.0 \n",
"926 15.827338 15.957447 0.0 \n",
"927 11.764706 11.567164 0.0 \n",
"928 0.000000 0.000000 0.0 \n",
"\n",
" percent_continued_tcn percent_continued_tl \n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
".. ... ... \n",
"924 0.0 0.0 \n",
"925 0.0 0.0 \n",
"926 0.0 0.0 \n",
"927 0.0 0.0 \n",
"928 0.0 0.0 \n",
"\n",
"[929 rows x 10 columns]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-entry summary: length of each version plus the share of that length\n",
"# found in margins and in continued divs.\n",
"versions = ['tc', 'tcn', 'tl']\n",
"summary_columns = ['entry_id', 'length_tc', 'length_tcn', 'length_tl',\n",
"                   'percent_marginal_tc', 'percent_marginal_tcn', 'percent_marginal_tl',\n",
"                   'percent_continued_tc', 'percent_continued_tcn', 'percent_continued_tl']\n",
"\n",
"# Gather the rows first and build the frame once; enlarging a frame row by\n",
"# row with df.loc[i] = ... gets slow as the row count grows.\n",
"summary_rows = []\n",
"for entry in manuscript.entries.values():\n",
"    # Keep only entries that have margins or a continued div in the 'tl' text.\n",
"    if len(entry.margins) > 0 or 'continues=\"yes\"' in entry.text('tl', xml=True):\n",
"        # Row layout must follow summary_columns: id, lengths, marginal %, continued %.\n",
"        summary_rows.append(\n",
"            [entry.identity]\n",
"            + [entry.length[version] for version in versions]\n",
"            + [find_percent_marginal(entry, version) for version in versions]\n",
"            + [find_percent_continued(entry, version) for version in versions]\n",
"        )\n",
"\n",
"df = pd.DataFrame(summary_rows, columns=summary_columns)\n",
"df\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit f6c18a0

Please sign in to comment.