-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert-to-json.py
81 lines (67 loc) · 2.24 KB
/
convert-to-json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import re
import json
import os
from os import listdir
from os.path import isfile, join
from pathlib import Path
def main():
    """Convert every regular file in the ``books`` directory to JSON.

    Each file directly inside ``books`` is parsed by ``add_file_to_json``
    into one shared dictionary, which is then handed to ``write_to_json``.
    Entries that are not regular files (e.g. sub-directories) are ignored.
    """
    source_dir = "books"
    collected = {}
    for entry in listdir(source_dir):
        entry_path = join(source_dir, entry)
        # Only plain files are treated as books.
        if not isfile(entry_path):
            continue
        add_file_to_json(entry_path, collected)
    write_to_json(collected)
# Matches a chapter heading line such as "Chapter 12" or "Chapter Twenty-One".
_CHAPTER_HEADING = re.compile(r'Chapter ([A-Za-z-]|[0-9])+$')


def add_file_to_json(filename, data):
    """Parse one book text file into paragraph records stored in *data*.

    The file is read as UTF-8 and split into lines. Consecutive non-blank
    lines form a paragraph; a blank (or whitespace-only) line, a chapter
    heading, or the end of the file closes the paragraph. A line matching
    ``Chapter <word-or-number>`` starts a new chapter: the chapter counter
    is incremented, the chapter name is taken from ``next_non_empty_line``,
    and the line directly after the heading is skipped.

    Args:
        filename: Path of the text file to read. The file name without its
            extension becomes the book name.
        data: Dictionary updated in place; ``data[book_name]`` is set to a
            list of dicts with the keys ``chapter_num``, ``chapter_name``,
            ``paragraph`` and ``book_name``.

    An empty file yields an empty list for its book rather than an error.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, "r", encoding="utf8") as file_handle:
        lines = file_handle.read().splitlines()

    book_name = Path(filename).stem
    book_data = []
    if not lines:
        data[book_name] = book_data
        return

    # Until the first chapter heading is seen, the first line of the file
    # serves as the chapter name (with chapter number 0).
    chapter_name = lines[0]
    chapter_num = 0
    pending = []  # lines of the paragraph currently being collected

    def flush_paragraph():
        # Emit the collected paragraph, if any, under the current chapter.
        if pending:
            # NOTE(review): lines are joined without a separator, matching
            # the existing output format; hard-wrapped input would need a
            # space here -- confirm against the book files.
            book_data.append({
                'chapter_num': chapter_num,
                'chapter_name': chapter_name,
                'paragraph': "".join(pending),
                'book_name': book_name})
            pending.clear()

    skip_next = False
    # Walk every line: the previous index arithmetic stopped one line
    # short, so the final line of each file was never examined.
    for i, line in enumerate(lines):
        if skip_next:
            skip_next = False
            continue
        if _CHAPTER_HEADING.match(line):
            # Close any open paragraph first so that its text stays with
            # the chapter it was written under.
            flush_paragraph()
            chapter_num += 1
            chapter_name = next_non_empty_line(i, lines)
            skip_next = True
        elif line.strip():
            pending.append(line)
        else:
            flush_paragraph()

    # A file that does not end with a blank line still has an open
    # paragraph at this point; emit it instead of dropping it.
    flush_paragraph()
    data[book_name] = book_data
def next_non_empty_line(i, lines):
    """Return the first non-blank line located after index *i*.

    Lines that are empty or consist only of whitespace are passed over.
    Every apostrophe in the returned line is doubled (``'`` -> ``''``).
    An empty string is returned when no such line exists.
    """
    for position in range(i + 1, len(lines)):
        candidate = lines[position]
        is_blank = candidate == "" or candidate.isspace()
        if not is_blank:
            return candidate.replace("'", "''")
    return ""
def write_to_json(data):
    """Serialize *data* as JSON into ``src/books.json``.

    The path is relative to the current working directory; an existing
    file at that location is overwritten.
    """
    target = 'src/books.json'
    with open(target, mode='w') as out_file:
        json.dump(data, out_file)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()