Skip to content

Commit

Permalink
Improve punctuation spacing.
Browse files Browse the repository at this point in the history
  • Loading branch information
scossu committed May 24, 2024
1 parent 26c3513 commit 0f0bb3a
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 15 deletions.
32 changes: 28 additions & 4 deletions scriptshifter/hooks/general/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,21 @@
from scriptshifter.trans import MULTI_WS_RE


NORM1_RE = compile(r"([.,;:\)\]}])\s")
NORM2_RE = compile(r"(\S)([.,;:\)\]}])")
NORM3_RE = compile(r"\s([\)\]\}])")
NORM4_RE = compile(r"([\)\]\}])(\S)")
# Punctuation and brackets.
# TODO add angled brackets, opening and closing quotes, etc.
NORM1_RE = compile(r"\s([.,;:\)\]}])")
NORM2_RE = compile(r"([.,;:\)\]}])(\S)")
NORM3_RE = compile(r"([\(\[\{])\s")
NORM4_RE = compile(r"(\S)([\(\[\{])")

# "Straight" quotes.
# TODO Add single quotes.
NORM5_RE = compile(r"\"\s*([^\"]?)\s*\"")
NORM6_RE = compile(r"(\S)(\"[^\"]?\")")
NORM7_RE = compile(r"(\"[^\"]?\")(\S)")

# Space between symbols.
NORM8_RE = compile(r"([.,;:\(\[\{\)\]}])\s+([.,;:\(\[\{\)\]}])")

logger = getLogger(__name__)

Expand All @@ -22,10 +33,23 @@ def normalize_spacing_post_assembly(ctx):
"""
# De-duplicate whitespace.
logger.debug(f"Dest pre manipulation: {ctx.dest}")
# Collapse repeated white space and trim both ends of the string.
norm = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
# Remove space before punctuation and closing brackets.
norm = NORM1_RE.sub(r"\1", norm)
# Ensure space after punctuation and closing brackets.
norm = NORM2_RE.sub(r"\1 \2", norm)
# Remove space after opening brackets.
norm = NORM3_RE.sub(r"\1", norm)
# Ensure space before opening brackets.
norm = NORM4_RE.sub(r"\1 \2", norm)
# Remove space inside matched quotes.
norm = NORM5_RE.sub(r"\"\1\"", norm)
# Add space before opening double quote.
norm = NORM6_RE.sub(r"\1 \2", norm)
# Add space after closing double quote.
norm = NORM7_RE.sub(r"\1 \2", norm)
# Remove white space between adjacent punctuation signs (currently disabled).
# norm = NORM8_RE.sub(r"\1\2", norm)

return norm
22 changes: 11 additions & 11 deletions tests/data/script_samples/chinese.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,33 @@ chinese,廖忠俊. 著名狀元榜眼探花傳略,liao zhong jun. zhu ming zhuan
chinese,宋至道三年(997年),唃廝囉在高昌磨榆國出生,是吐蕃雅隆覺阿王系的後裔。,"song zhi dao san nian (997nian), gu si luo zai gao chang mo yu guo chu sheng, shi Tufan ya long jue a wang xi de hou yi.",,
chinese,文學革命論 / 陳獨秀 -- 人的文學 / 周作人 -- 新文學運動的意義 / 張我軍.,wen xue ge ming lun / chen du xiu -- ren de wen xue / zhou zuo ren -- xin wen xue yun dong de yi yi / zhang wo jun.,,
chinese,中國近代人口史料彙覽 = ZHONGGUO JINDAI RENKOU SHILIAO HUILAN / 《中國近代人口史料彙覽》編寫組編.,"Zhongguo jin dai ren kou shi liao hui lan = ZHONGGUO JINDAI RENKOU SHILIAO HUILAN / ""Zhongguo jin dai ren kou shi liao hui lan"" bian xie zu bian.",,
chinese,"桂林 : 广西师范大学出版社, 2022.","Guilin : Guangxi shi fan da xue chu ban she, 2022.",,
chinese,"桂林 : 广西师范大学出版社, 2022.","Guilin: Guangxi shi fan da xue chu ban she, 2022.",,
chinese,王正强,"Wang, Zhengqiang","{""marc_field"": ""100""}","MARC 100 field, surname entry requires a comma that is not in the vernacular."
chinese,"秦腔通史 = QINQIANG TONGSHI / 王正强, 周琪著.","Qin qiang tong shi = QINQIANG TONGSHI / Wang Zhengqiang, Zhou Qi zhu.",,
chinese,"困學紀聞 : 全校本 / [宋]王應麟著 ; [清]翁元圻等注 ; 欒保羣, 田松青, 呂宗力校點.","Kun xue ji wen : quan jiao ben / [Song] Wang Yinglin zhu ; [Qing] Weng Yuanqi deng zhu ; Luan Baoqun, Tian Songqing, Lü Zongli jiao dian.",,
chinese,"困學紀聞 : 全校本 / [宋]王應麟著 ; [清]翁元圻等注 ; 欒保羣, 田松青, 呂宗力校點.","Kun xue ji wen: quan jiao ben / [Song] Wang Yinglin zhu; [Qing] Weng Yuanqi deng zhu; Luan Baoqun, Tian Songqing, Lü Zongli jiao dian.",,
chinese,"修訂版, 第 1 版.","Xiu ding ban, di 1 ban.",,
chinese,"上海市 : 上海古籍出版社, 2023.","Shanghai Shi : Shanghai gu ji chu ban she, 2023.",,
chinese,"上海市 : 上海古籍出版社, 2023.","Shanghai Shi: Shanghai gu ji chu ban she, 2023.",,
chinese,遼代墓誌校注 = Collation and Annotation of Epitaphs in the Liao Dynasty / 周阿根校注.,Liao dai mu zhi jiao zhu = Collation and Annotation of Epitaphs in the Liao Dynasty / Zhou Agen jiao zhu.,,
chinese,"天津市 : 天津古籍出版社, 2022.","Tianjin Shi : Tianjin gu ji chu ban she, 2022.",,
chinese,"天津市 : 天津古籍出版社, 2022.","Tianjin Shi: Tianjin gu ji chu ban she, 2022.",,
chinese,国家社科基金后期资助项目(Tianjin gu ji chu ban she) = Guojia sheke jijin houqi zizhu xiangmu,Guo jia she ke ji jin hou qi zi zhu xiang mu (Tianjin gu ji chu ban she) = Guojia sheke jijin houqi zizhu xiangmu,,
chinese,楊于庭集 / (明)楊于庭撰 ; 政協全椒縣委員會編.,Yang Yuting ji / (Ming) Yang Yuting zhuan ; Zheng xie Quanjiao Xian wei yuan hui bian.,,
chinese,"北京市 : 國家圖書館出版社, 2020.","Beijing Shi : Guo jia tu shu guan chu ban she, 2020.",,
chinese,楊于庭集 / (明)楊于庭撰 ; 政協全椒縣委員會編.,Yang Yuting ji / (Ming) Yang Yuting zhuan; Zheng xie Quanjiao Xian wei yuan hui bian.,,
chinese,"北京市 : 國家圖書館出版社, 2020.","Beijing Shi: Guo jia tu shu guan chu ban she, 2020.",,
chinese,全椒古代典籍叢書,Quanjiao gu dai dian ji cong shu,,
chinese,春秋質疑十二卷 -- 楊道行集三十三卷.,Chun qiu zhi yi shi er juan -- Yang Daoxing ji san shi san juan.,,
chinese,"楊于庭, jin shi 1580. 春秋質疑.","Yang, Yuting, jin shi 1580. Chun qiu zhi yi.","{""marc_field"": ""700""}","MARC 700 field, comma needs to be added into the surname entry that is not in the vernacular."
chinese,"中國人民政治協商會議. 全椒縣委員會, editor.","Zhongguo ren min zheng zhi xie shang hui yi. Quanjiao Xian wei yuan hui, editor.",,
chinese,漢魏六朝隋碑誌索引 / 劉琴麗編著.,Han Wei Liu chao sui bei zhi suo yin / Liu Qinli bian zhu.,,
chinese,"北京 : 中国社会科学出版社, 2019.","Beijing : Zhongguo she hui ke xue chu ban she, 2019.",,
chinese,"北京 : 中国社会科学出版社, 2019.","Beijing: Zhongguo she hui ke xue chu ban she, 2019.",,
chinese,中國社會科學院歷史研究所專刊,Zhongguo she hui ke xue yuan li shi yan jiu suo zhuan kan,,
chinese,清代河務檔案 = QINGDAI HEWU DANG'AN / 《清代河務檔案》編寫組編.,"Qing dai he wu dang an = QINGDAI HEWU DANG'AN / ""Qing dai he wu dang an"" bian xie zu bian.",,
chinese,"桂林 : 广西师范大学出版社, 2022.","Guilin : Guangxi shi fan da xue chu ban she, 2022.",,
chinese,"桂林 : 广西师范大学出版社, 2022.","Guilin: Guangxi shi fan da xue chu ban she, 2022.",,
chinese,"《清代河務檔案》編寫組, editor.","""Qing dai he wu dang an"" bian xie zu, editor.",,
chinese,"張穆全集 / 張正明, 安介生主編.","Zhang Mu quan ji / Zhang Zhengming, An Jiesheng zhu bian.",,
chinese,"太原市 : 三晋出版社, 2019.","Taiyuan Shi : San Jin chu ban she, 2019.",,
chinese,"太原市 : 三晋出版社, 2019.","Taiyuan Shi: San Jin chu ban she, 2019.",,
chinese,欒保羣,"Luan, Baoqun","{""marc_field"": ""700""}","MARC 700 field, comma needs to be added into the surname entry that is not in the vernacular."
chinese,呂宗力,"Lü, Zongli","{""marc_field"": ""700""}","MARC 700 field, comma needs to be added into the surname entry that is not in the vernacular."
chinese,袁昶友朋書札 / (清)袁昶輯 ; 謝冬榮等整理.,Yuan Chang you peng shu zha / (Qing) Yuan Chang ji ; Xie Dongrong deng zheng li.,,
chinese,"南京市 : 鳳凰出版社, 2021.","Nanjing Shi : Feng huang chu ban she, 2021.",,
chinese,袁昶友朋書札 / (清)袁昶輯 ; 謝冬榮等整理.,Yuan Chang you peng shu zha / (Qing) Yuan Chang ji; Xie Dongrong deng zheng li.,,
chinese,"南京市 : 鳳凰出版社, 2021.","Nanjing Shi: Feng huang chu ban she, 2021.",,
chinese,漢代簡牘草書整理與研究 / 李洪財著,Han dai jian du cao shu zheng li yu yan jiu / Li Hongcai zhu.,,
chinese,‡6 505-06 ‡g 第 1 冊. ‡t 髫齡瑣記 二卷 -- ‡g 第 1-2 冊. ‡t 廿年略記 六卷-- ‡g 第 3 冊. ‡t 過眼雲煙 四卷 -- ‡g 第 4 冊. ‡t 曙光集 三卷. ‡t 附錄. ‡t 趙文郁詩草七種. ‡t 芸窗雜錄不分卷. ‡t 文法指南不分卷.,‡6 880-06 ‡g Di 1 ce. ‡t Tiao ling suo ji er juan -- ‡g Di 1-2 ce. ‡t Nian nian lüe ji liu juan-- ‡g Di 3 ce. ‡t Guo yan yun yan si juan -- ‡g Di 4 ce. ‡t Shu guang ji san juan. ‡t Fu lu. ‡t Zhao Wenyu shi cao qi zhong. ‡t Yun chuang za lu bu fen juan. ‡t Wen fa zhi nan bu fen zhuan.,,MARC designators left in.
chinese,本书收录赵文郁写于不同时期的日记十五卷,其中《髫龄琐记》二卷,记录了他十八岁以前的经历;《过眼云烟》(4册)、十八岁以后教书谋生的经历见于六卷的《二十年略记》;伪满洲国统治热河省期间,...,"Ben shu shou lu Zhao Wenyu xie yu bo tong shi qi de ri ji shi wu juan, qi zhong ""Ting ling suo ji"" er juan, ji lu le ta shi ba sui yi qian de jing li; ""Guo yan yun yan"" (4 ce), shi ba sui yi hou jiao shu mou sheng de jing li jian yu liu juan de “Er shi nian lüe ji;” wei Manzhouguo tong zhi Rehe Sheng qi jian, …",,
Expand Down

0 comments on commit 0f0bb3a

Please sign in to comment.