-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
modified thesaurus.py to fix singularization
- Loading branch information
1 parent
1136e71
commit f6c18a0
Showing
22 changed files
with
6,034 additions
and
6,816 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,364 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from digital_manuscript import BnF\n", | ||
"manuscript = BnF()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"df = pd.DataFrame(columns=['entry_id', 'version', 'position', 'render', 'length'])\n", | ||
"i = 0\n", | ||
"for identity, entry in manuscript.entries.items():\n", | ||
" if len(entry.margins) > 0:\n", | ||
" for version, margin_list in entry.margins.items():\n", | ||
" for margin in margin_list:\n", | ||
" df.loc[i] = [identity, version, margin.position, margin.render, margin.length]\n", | ||
" i += 1\n", | ||
"df.to_csv('margins.csv', index=False)\n", | ||
" \n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import re\n", | ||
"\n", | ||
"def find_percent_marginal(entry, version):\n", | ||
" if len(entry.margins) == 0:\n", | ||
" return 0\n", | ||
"\n", | ||
" margin_sum = 0\n", | ||
" for margin in entry.margins[version]:\n", | ||
" margin_sum += margin.length\n", | ||
" \n", | ||
" return 100 * margin_sum/entry.length[version]\n", | ||
"\n", | ||
"def find_percent_continued(entry, version):\n", | ||
" continued_sum = 0\n", | ||
" parts = re.findall(r'(<div([\\w\\s=\";-]*)>(.*?)</div>)', re.sub(r'\\s+', ' ', entry.text(version, True)))\n", | ||
" for part in parts:\n", | ||
" _, attributes, text = part\n", | ||
" if 'continues=\"yes\"' in attributes:\n", | ||
" text = re.sub(r'<.*?>', '', text)\n", | ||
" continued_sum += len(text)\n", | ||
" return 100 * continued_sum / entry.length[version]\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 53, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>entry_id</th>\n", | ||
" <th>length_tc</th>\n", | ||
" <th>length_tcn</th>\n", | ||
" <th>length_tl</th>\n", | ||
" <th>percent_marginal_tc</th>\n", | ||
" <th>percent_marginal_tcn</th>\n", | ||
" <th>percent_marginal_tl</th>\n", | ||
" <th>percent_continued_tc</th>\n", | ||
" <th>percent_continued_tcn</th>\n", | ||
" <th>percent_continued_tl</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>001r_1</td>\n", | ||
" <td>292</td>\n", | ||
" <td>302</td>\n", | ||
" <td>299</td>\n", | ||
" <td>95.205479</td>\n", | ||
" <td>95.033113</td>\n", | ||
" <td>94.983278</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>001r_2</td>\n", | ||
" <td>133</td>\n", | ||
" <td>140</td>\n", | ||
" <td>131</td>\n", | ||
" <td>96.240602</td>\n", | ||
" <td>95.000000</td>\n", | ||
" <td>96.183206</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>001r_3</td>\n", | ||
" <td>1502</td>\n", | ||
" <td>1513</td>\n", | ||
" <td>1511</td>\n", | ||
" <td>97.936085</td>\n", | ||
" <td>97.951091</td>\n", | ||
" <td>98.146923</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>001r_4</td>\n", | ||
" <td>94</td>\n", | ||
" <td>98</td>\n", | ||
" <td>96</td>\n", | ||
" <td>81.914894</td>\n", | ||
" <td>82.653061</td>\n", | ||
" <td>84.375000</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>001v_1</td>\n", | ||
" <td>392</td>\n", | ||
" <td>406</td>\n", | ||
" <td>372</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>...</th>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>924</th>\n", | ||
" <td>170r_6</td>\n", | ||
" <td>631</td>\n", | ||
" <td>647</td>\n", | ||
" <td>580</td>\n", | ||
" <td>35.340729</td>\n", | ||
" <td>35.703246</td>\n", | ||
" <td>36.724138</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>925</th>\n", | ||
" <td>170v_1</td>\n", | ||
" <td>498</td>\n", | ||
" <td>507</td>\n", | ||
" <td>523</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>926</th>\n", | ||
" <td>170v_2</td>\n", | ||
" <td>269</td>\n", | ||
" <td>278</td>\n", | ||
" <td>282</td>\n", | ||
" <td>16.728625</td>\n", | ||
" <td>15.827338</td>\n", | ||
" <td>15.957447</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>927</th>\n", | ||
" <td>170v_3</td>\n", | ||
" <td>497</td>\n", | ||
" <td>510</td>\n", | ||
" <td>536</td>\n", | ||
" <td>12.273642</td>\n", | ||
" <td>11.764706</td>\n", | ||
" <td>11.567164</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>928</th>\n", | ||
" <td>170v_4</td>\n", | ||
" <td>286</td>\n", | ||
" <td>296</td>\n", | ||
" <td>292</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.000000</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" <td>0.0</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"<p>929 rows × 10 columns</p>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" entry_id length_tc length_tcn length_tl percent_marginal_tc \\\n", | ||
"0 001r_1 292 302 299 95.205479 \n", | ||
"1 001r_2 133 140 131 96.240602 \n", | ||
"2 001r_3 1502 1513 1511 97.936085 \n", | ||
"3 001r_4 94 98 96 81.914894 \n", | ||
"4 001v_1 392 406 372 0.000000 \n", | ||
".. ... ... ... ... ... \n", | ||
"924 170r_6 631 647 580 35.340729 \n", | ||
"925 170v_1 498 507 523 0.000000 \n", | ||
"926 170v_2 269 278 282 16.728625 \n", | ||
"927 170v_3 497 510 536 12.273642 \n", | ||
"928 170v_4 286 296 292 0.000000 \n", | ||
"\n", | ||
" percent_marginal_tcn percent_marginal_tl percent_continued_tc \\\n", | ||
"0 95.033113 94.983278 0.0 \n", | ||
"1 95.000000 96.183206 0.0 \n", | ||
"2 97.951091 98.146923 0.0 \n", | ||
"3 82.653061 84.375000 0.0 \n", | ||
"4 0.000000 0.000000 0.0 \n", | ||
".. ... ... ... \n", | ||
"924 35.703246 36.724138 0.0 \n", | ||
"925 0.000000 0.000000 0.0 \n", | ||
"926 15.827338 15.957447 0.0 \n", | ||
"927 11.764706 11.567164 0.0 \n", | ||
"928 0.000000 0.000000 0.0 \n", | ||
"\n", | ||
" percent_continued_tcn percent_continued_tl \n", | ||
"0 0.0 0.0 \n", | ||
"1 0.0 0.0 \n", | ||
"2 0.0 0.0 \n", | ||
"3 0.0 0.0 \n", | ||
"4 0.0 0.0 \n", | ||
".. ... ... \n", | ||
"924 0.0 0.0 \n", | ||
"925 0.0 0.0 \n", | ||
"926 0.0 0.0 \n", | ||
"927 0.0 0.0 \n", | ||
"928 0.0 0.0 \n", | ||
"\n", | ||
"[929 rows x 10 columns]" | ||
] | ||
}, | ||
"execution_count": 53, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df = pd.DataFrame(columns=['entry_id', 'length_tc', 'length_tcn', 'length_tl',\n", | ||
" 'percent_marginal_tc', 'percent_marginal_tcn', 'percent_marginal_tl',\n", | ||
" 'percent_continued_tc', 'percent_continued_tcn', 'percent_continued_tl'])\n", | ||
"versions = ['tc', 'tcn', 'tl']\n", | ||
"i=0\n", | ||
"\n", | ||
"for identity, entry in manuscript.entries.items():\n", | ||
" if len(entry.margins) > 0 or 'continues=\"yes\"' in entry.text('tl', xml=True):\n", | ||
" percent_marginal, percent_continued = {}, {}\n", | ||
" for version in versions:\n", | ||
" percent_marginal[version] = find_percent_marginal(entry, version)\n", | ||
" percent_continued[version] = find_percent_continued(entry, version)\n", | ||
"\n", | ||
" df.loc[i] = [entry.identity, entry.length['tc'], entry.length['tcn'], entry.length['tl'],\n", | ||
" percent_marginal['tc'], percent_marginal['tcn'], percent_marginal['tl'],\n", | ||
" percent_continued['tc'], percent_continued['tcn'], percent_continued['tl']]\n", | ||
" i += 1\n", | ||
"df\n", | ||
" \n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.