-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwhooshing.py
62 lines (45 loc) · 1.67 KB
/
whooshing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
from tqdm import tqdm
from util import data_io
from whoosh import index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import TEXT, ID, Schema
from whoosh.qparser import QueryParser, MultifieldParser
def build_index(data, schema, index_dir="indexdir"):
    """Create a Whoosh index in ``index_dir`` and add every document of ``data``.

    Args:
        data: Iterable of dicts. Each dict is expanded into keyword arguments
            of ``writer.add_document``, so its keys must be field names of
            ``schema``.
        schema: Whoosh schema used to create the index.
        index_dir: Directory holding the index; it is created (including any
            missing parent directories) when it does not exist yet.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: there is no
    # window between the check and the creation, and nested paths work too.
    os.makedirs(index_dir, exist_ok=True)

    ix = index.create_in(index_dir, schema)
    # Used as a context manager, the writer commits on a clean exit and
    # cancels when an exception escapes, so a failing document does not leave
    # the index write lock held.
    with ix.writer() as writer:
        for doc in tqdm(data):
            writer.add_document(**doc)
# Record keys that are indexed as full-text fields. Some names contain spaces;
# they are passed through process_field_name() before being used as schema
# field names.
FIELDS = [
    "Entscheidungsgründe",
    "Abweichende Meinung",
    "Tatbestand",
    "Sonstiger Orientierungssatz",
    "Leitsatz",
    "Tenor",
    "Gründe",
    "Orientierungssatz",
    "title",
    "subtitle",
]
def process_field_name(f: str):
    """Return ``f`` with every space character replaced by an underscore."""
    pieces = f.split(" ")
    return "_".join(pieces)
def build_schema_and_corpus(file="BverfG.jsonl.gz"):
    """Build the index schema and the stream of documents to index.

    Args:
        file: Path handed to ``data_io.read_jsonl``; defaults to the
            previously hard-coded ``"BverfG.jsonl.gz"``.

    Returns:
        A ``(schema, data)`` tuple. ``schema`` has a stored ID field
        ``aktenzeichen`` plus one stored TEXT field (``lang="de"``) per entry
        of ``FIELDS``, named via ``process_field_name``. ``data`` is a
        generator of dicts that keeps only those keys of each record, renamed
        via ``process_field_name``.
    """
    fields = {process_field_name(f): TEXT(stored=True, lang="de") for f in FIELDS}
    schema = Schema(aktenzeichen=ID(stored=True), **fields)

    # Built once here; the membership test runs for every key of every record,
    # so it should not rebuild and linearly scan a list each time.
    wanted_keys = frozenset(FIELDS) | {"aktenzeichen"}
    data = (
        {process_field_name(k): v for k, v in d.items() if k in wanted_keys}
        for d in data_io.read_jsonl(file)
    )
    return schema, data
if __name__ == "__main__":
    # Script entry point: build the index on first use, then run one example
    # query against it and dump the stored fields of every hit to a file.
    INDEX_DIR = "bverfg_index"
    # exists_in() checks for an actual index rather than just a directory, so
    # an existing but empty directory still triggers a build.
    if not index.exists_in(INDEX_DIR):
        schema, data = build_schema_and_corpus()
        build_index(data, schema, index_dir=INDEX_DIR)
        print("done building corpus")

    ix = index.open_dir(INDEX_DIR)
    with ix.searcher() as searcher:
        # Schema field names were created through process_field_name(), so the
        # parser must be given the same normalized names; the raw names with
        # spaces do not exist in the schema and would never match.
        search_fields = [process_field_name(f) for f in FIELDS]
        qp = MultifieldParser(search_fields, schema=ix.schema)
        q = qp.parse("185 StGB")
        # limit=None returns all matching documents instead of a top-N page.
        results = searcher.search(q, limit=None)
        # Hits are consumed while the searcher is still open.
        data_io.write_jsonl("result.jsonl", (r.fields() for r in results))