# -*- coding: utf-8 -*-
#
# Author: Roland Pihlakas, 2023 - 2024
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
#
if __name__ == '__main__':
    print("Starting...")

import os
import sys
import traceback
import httpcore
import httpx
import time
from configparser import ConfigParser
# import spacy
# from spacy import displacy # load it only when rendering is requested, since this package loads slowly
import re
import regex
from collections import defaultdict, Counter, OrderedDict
import hashlib
import string
import base64
from bisect import bisect_right
import statistics
import rapidfuzz.process
import rapidfuzz.fuzz
from fuzzysearch import find_near_matches
from ncls import NCLS
import json_tricks
import json # json.decoder.JSONDecodeError
# import openai
import tenacity # for exponential backoff
import openai_async
import tiktoken
# organization = os.getenv("OPENAI_API_ORG")
api_key = os.getenv("OPENAI_API_KEY")
# openai.organization = organization
# openai.api_key = api_key
from Utilities import init_logging, safeprint, print_exception, loop, debugging, is_dev_machine, data_dir, Timer, read_file, save_file, read_raw, save_raw, read_txt, save_txt, strtobool, async_cached, async_cached_encrypted
from TimeLimit import time_limit
# if __name__ == "__main__":
#     init_logging(os.path.basename(__file__), __name__, max_old_log_rename_tries = 1)

if __name__ == "__main__":
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

if is_dev_machine:
    from pympler import asizeof

letters_regex = regex.compile(r'\p{L}') # matches unicode letters only, not digits # regex has better Unicode support than re

def remove_quotes(text):
    return text.replace("'", "").replace('"', '')


def remove_percent(text):
    if text[-1:] == "%":
        return text[:-1]
    else:
        return text


def rotate_list(list, n):
    return list[n:] + list[:n]

def get_config():
    config = ConfigParser(inline_comment_prefixes=("#", ";")) # by default, inline comments were not allowed
    config.read('Recogniser.ini')
    config_section = "Recogniser"

    gpt_model = remove_quotes(config.get(config_section, "GPTModel", fallback="gpt-3.5-turbo-16k")).strip()
    enable_auto_override_to_gpt4_32k = strtobool(remove_quotes(config.get(config_section, "EnableAutoOverrideToGPT4_32k", fallback="false")))
    gpt_timeout = int(remove_quotes(config.get(config_section, "GPTTimeoutInSeconds", fallback="600")).strip())
    extract_message_indexes = strtobool(remove_quotes(config.get(config_section, "ExtractMessageIndexes", fallback="false")))
    extract_line_numbers = strtobool(remove_quotes(config.get(config_section, "ExtractLineNumbers", fallback="false")))
    do_open_ended_analysis = strtobool(remove_quotes(config.get(config_section, "DoOpenEndedAnalysis", fallback="true")))
    do_closed_ended_analysis = strtobool(remove_quotes(config.get(config_section, "DoClosedEndedAnalysis", fallback="true")))
    keep_unexpected_labels = strtobool(remove_quotes(config.get(config_section, "KeepUnexpectedLabels", fallback="true")))
    chart_type = remove_quotes(config.get(config_section, "ChartType", fallback="radar")).strip()
    render_output = strtobool(remove_quotes(config.get(config_section, "RenderOutput", fallback="false")))
    create_pdf = strtobool(remove_quotes(config.get(config_section, "CreatePdf", fallback="true")))
    treat_entire_text_as_one_person = strtobool(remove_quotes(config.get(config_section, "TreatEntireTextAsOnePerson", fallback="false"))) # TODO
    anonymise_names = strtobool(remove_quotes(config.get(config_section, "AnonymiseNames", fallback="false")))
    anonymise_numbers = strtobool(remove_quotes(config.get(config_section, "AnonymiseNumbers", fallback="false")))
    named_entity_recognition_model = remove_quotes(config.get(config_section, "NamedEntityRecognitionModel", fallback="en_core_web_sm")).strip()
    encrypt_cache_data = strtobool(remove_quotes(config.get(config_section, "EncryptCacheData", fallback="true")))
    split_messages_by = remove_quotes(config.get(config_section, "SplitMessagesBy", fallback="")).strip()
    keep_message_newlines_in_llm_input = strtobool(remove_quotes(config.get(config_section, "KeepMessageNewLinesInLLMInput", fallback="false")))
    ignore_incorrectly_assigned_citations = strtobool(remove_quotes(config.get(config_section, "IgnoreIncorrectlyAssignedCitations", fallback="false")))
    allow_multiple_citations_per_message = strtobool(remove_quotes(config.get(config_section, "AllowMultipleCitationsPerMessage", fallback="true")))
    citation_lookup_time_limit = float(remove_quotes(config.get(config_section, "CitationLookupTimeLimit", fallback="0.1")))
    citation_lookup_outer_time_limit = float(remove_quotes(config.get(config_section, "CitationLookupOuterTimeLimit", fallback="1.0")))
    temperature = float(remove_quotes(config.get(config_section, "Temperature", fallback="0.0")))
    sample_count = int(remove_quotes(config.get(config_section, "SampleCount", fallback="1")))
    default_label_treshold_sample_percent = float(remove_percent(remove_quotes(config.get(config_section, "DefaultLabelThresholdSamplePercent", fallback="50"))))

    result = {
        "gpt_model": gpt_model,
        "enable_auto_override_to_gpt4_32k": enable_auto_override_to_gpt4_32k,
        "gpt_timeout": gpt_timeout,
        "extract_message_indexes": extract_message_indexes,
        "extract_line_numbers": extract_line_numbers,
        "do_open_ended_analysis": do_open_ended_analysis,
        "do_closed_ended_analysis": do_closed_ended_analysis,
        "keep_unexpected_labels": keep_unexpected_labels,
        "chart_type": chart_type,
        "render_output": render_output,
        "create_pdf": create_pdf,
        "treat_entire_text_as_one_person": treat_entire_text_as_one_person,
        "anonymise_names": anonymise_names,
        "anonymise_numbers": anonymise_numbers,
        "named_entity_recognition_model": named_entity_recognition_model,
        "encrypt_cache_data": encrypt_cache_data,
        "split_messages_by": split_messages_by,
        "keep_message_newlines_in_llm_input": keep_message_newlines_in_llm_input,
        "ignore_incorrectly_assigned_citations": ignore_incorrectly_assigned_citations,
        "allow_multiple_citations_per_message": allow_multiple_citations_per_message,
        "citation_lookup_time_limit": citation_lookup_time_limit,
        "citation_lookup_outer_time_limit": citation_lookup_outer_time_limit,
        "temperature": temperature,
        "sample_count": sample_count,
        "default_label_treshold_sample_percent": default_label_treshold_sample_percent,
    }
    return result
#/ get_config()

## https://platform.openai.com/docs/guides/rate-limits/error-mitigation
@tenacity.retry(wait=tenacity.wait_random_exponential(min=1, max=60), stop=tenacity.stop_after_attempt(6)) # TODO: config parameters
async def completion_with_backoff(gpt_timeout, **kwargs): # TODO: ensure that only HTTP 429 is handled here
    # return openai.ChatCompletion.create(**kwargs)

    qqq = True # for debugging

    attempt_number = completion_with_backoff.retry.statistics["attempt_number"]
    timeout_multiplier = 2 ** (attempt_number - 1) # increase timeout exponentially

    try:
        timeout = gpt_timeout * timeout_multiplier

        safeprint(f"Sending OpenAI API request... Using timeout: {timeout} seconds")

        openai_response = await openai_async.chat_complete(
            api_key,
            timeout = timeout,
            payload = kwargs
        )

        safeprint("Done OpenAI API request.")

        openai_response = json_tricks.loads(openai_response.text)

        if openai_response.get("error"):
            if openai_response["error"]["code"] == 502 or openai_response["error"]["code"] == 503: # Bad Gateway or Service Unavailable
                raise httpcore.NetworkError(openai_response["error"]["message"])
            else:
                raise Exception(str(openai_response["error"]["code"]) + " : " + openai_response["error"]["message"]) # TODO: use a more specific exception type

        # NB! this line may also throw an exception if OpenAI announces that it is overloaded # TODO: do not retry for all error messages
        response_content = openai_response["choices"][0]["message"]["content"]
        finish_reason = openai_response["choices"][0]["finish_reason"]

        return (response_content, finish_reason)

    except Exception as ex: # httpcore.ReadTimeout
        t = type(ex)
        if (t is httpcore.ReadTimeout or t is httpx.ReadTimeout): # both exception types have occurred
            if attempt_number < 6: # TODO: config parameter
                safeprint("Read timeout, retrying...")
            else:
                safeprint("Read timeout, giving up")
        elif (t is httpcore.NetworkError):
            if attempt_number < 6: # TODO: config parameter
                safeprint("Network error, retrying...")
            else:
                safeprint("Network error, giving up")
        elif (t is json.decoder.JSONDecodeError):
            if attempt_number < 6: # TODO: config parameter
                safeprint("Response format error, retrying...")
            else:
                safeprint("Response format error, giving up")
        else: #/ if (t is httpcore.ReadTimeout
            msg = str(ex) + "\n" + traceback.format_exc()
            print_exception(msg)
        #/ if (t is httpcore.ReadTimeout
        raise
    #/ except Exception as ex:
#/ async def completion_with_backoff(gpt_timeout, **kwargs):

def get_encoding_for_model(model):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        safeprint("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    return encoding
#/ def get_encoding_for_model(model):

# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, model, encoding = None):
"""Return the number of tokens used by a list of messages."""
if encoding is None:
encoding = get_encoding_for_model(model)
if model in {
"gpt-3.5-turbo-0613",
"gpt-3.5-turbo-16k-0613",
"gpt-4-0314",
"gpt-4-0613",
"gpt-4-32k-0314",
"gpt-4-32k-0613",
}:
tokens_per_message = 3
tokens_per_name = 1
elif model == "gpt-3.5-turbo-0301":
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
tokens_per_name = -1 # if there's a name, the role is omitted
elif "gpt-3.5-turbo-16k" in model: # roland
# safeprint("Warning: gpt-3.5-turbo-16k may update over time. Returning num tokens assuming gpt-3.5-turbo-16k-0613.")
return num_tokens_from_messages(messages, model="gpt-3.5-turbo-16k-0613", encoding=encoding)
elif "gpt-3.5-turbo" in model:
# safeprint("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613", encoding=encoding)
elif "gpt-4-32k" in model: # roland
# safeprint("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-32k-0613.")
return num_tokens_from_messages(messages, model="gpt-4-32k-0613", encoding=encoding)
elif "gpt-4" in model:
# safeprint("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
return num_tokens_from_messages(messages, model="gpt-4-0613", encoding=encoding)
else:
#raise NotImplementedError(
# f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
#)
safeprint(f"num_tokens_from_messages() is not implemented for model {model}")
# just take some conservative assumptions here
tokens_per_message = 4
tokens_per_name = 1
num_tokens = 0
for message in messages:
num_tokens += tokens_per_message
for key, value in message.items():
num_tokens += len(encoding.encode(value))
if key == "name":
num_tokens += tokens_per_name
#/ for key, value in message.items():
#/ for message in messages:
num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>
return num_tokens
#/ def num_tokens_from_messages(messages, model, encoding=None):
def get_max_tokens_for_model(model_name):
    # TODO: config
    if model_name == "gpt-4-turbo-preview": # https://platform.openai.com/docs/models/gpt-4
        max_tokens = 128000
    elif model_name == "gpt-4-0125-preview": # https://platform.openai.com/docs/models/gpt-4
        max_tokens = 128000
    elif model_name == "gpt-4-1106-preview": # https://platform.openai.com/docs/models/gpt-4
        max_tokens = 128000
    elif model_name == "gpt-4-32k": # https://platform.openai.com/docs/models/gpt-4
        max_tokens = 32768
    elif model_name == "gpt-3.5-turbo-16k": # https://platform.openai.com/docs/models/gpt-3-5
        max_tokens = 16384
    elif model_name == "gpt-4": # https://platform.openai.com/docs/models/gpt-4
        max_tokens = 8192
    elif model_name == "gpt-3.5-turbo": # https://platform.openai.com/docs/models/gpt-3-5
        max_tokens = 4096
    else:
        max_tokens = 4096
    return max_tokens
#/ def get_max_tokens_for_model(model_name):

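# Sends the prepared chat messages to the OpenAI API. If the input does not fit the
# current model's context window, the model is overridden with a larger-context one
# (subject to the enable_auto_override_* flags); if the response is truncated with
# finish_reason == "length", the model is first upgraded and, only if that does not
# help, a continuation request is appended and the query is repeated, concatenating
# the partial responses.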
async def run_llm_analysis_uncached(model_name, encoding, gpt_timeout, messages, continuation_request, temperature = 0, sample_index = 0, enable_auto_override_to_gpt4_32k = False): # sample_index is used only for cache keying and not inside this function
    responses = []

    max_tokens = get_max_tokens_for_model(model_name)
    enable_auto_override_to_gpt4_turbo = True # GPT-4-Turbo is widely available, so no need for config

    with Timer("Sending OpenAI API requests"):
        continue_analysis = True
        too_long = False
        model_upgraded = False
        while continue_analysis:
            num_input_tokens = num_tokens_from_messages(messages, model_name, encoding)
            safeprint(f"num_input_tokens: {num_input_tokens} max_tokens: {max_tokens}")

            # if num_tokens <= 0:
            #     break

            # assert(num_input_tokens < max_tokens)

            # TODO: configuration for model override thresholds and for override model names
            # TODO: how to handle finish_reason == "length" in case the model is gpt-4-32k?
            if enable_auto_override_to_gpt4_32k and num_input_tokens >= (8192 * 1.5) and model_name == "gpt-3.5-turbo-16k": # max_tokens == 16384: # current model: "gpt-3.5-turbo-16k"
                if not too_long or model_upgraded:
                    assert(False)
                model_name = "gpt-4-32k" # https://platform.openai.com/docs/models/gpt-4
                max_tokens = 32768
                safeprint(f"Overriding model with {model_name} due to input token count")
            elif enable_auto_override_to_gpt4_turbo and num_input_tokens >= (8192 * 1.5) and model_name == "gpt-3.5-turbo-16k": # max_tokens == 16384: # current model: "gpt-3.5-turbo-16k"
                if not too_long or model_upgraded:
                    assert(False)
                model_name = "gpt-4-turbo-preview" # https://platform.openai.com/docs/models/gpt-4
                max_tokens = 128000
                safeprint(f"Overriding model with {model_name} due to input token count")
            elif enable_auto_override_to_gpt4_turbo and num_input_tokens >= (16384 * 1.5) and model_name == "gpt-4-32k": # max_tokens == 32768: # current model: "gpt-4-32k"
                if not too_long or model_upgraded:
                    assert(False)
                model_name = "gpt-4-turbo-preview" # https://platform.openai.com/docs/models/gpt-4
                max_tokens = 128000
                safeprint(f"Overriding model with {model_name} due to input token count")
            elif num_input_tokens >= (4096 * 1.5) and model_name == "gpt-4": # max_tokens == 8192: # current model: "gpt-4"
                if not too_long or model_upgraded:
                    assert(False)
                if enable_auto_override_to_gpt4_32k:
                    model_name = "gpt-4-32k" # https://platform.openai.com/docs/models/gpt-4
                    max_tokens = 32768
                # comment-out: do not override gpt-4 with gpt-4-turbo since this model has only 4096 output tokens
                # elif enable_auto_override_to_gpt4_turbo:
                #     model_name = "gpt-4-turbo-preview"
                #     max_tokens = 128000
                else:
                    model_name = "gpt-3.5-turbo-16k" # https://platform.openai.com/docs/models/gpt-3-5
                    max_tokens = 16384
                safeprint(f"Overriding model with {model_name} due to input token count")
            elif num_input_tokens >= (2048 * 1.5) and model_name == "gpt-3.5-turbo": # max_tokens <= 4096: # current model: "gpt-3.5-turbo"
                if not too_long or model_upgraded:
                    assert(False)
                model_name = "gpt-3.5-turbo-16k" # https://platform.openai.com/docs/models/gpt-3-5
                max_tokens = 16384
                safeprint(f"Overriding model with {model_name} due to input token count")

            if model_name == "gpt-4-turbo-preview":
                max_tokens2 = 4096
            else:
                reserve_tokens = 100 # just in case, to not trigger OpenAI API errors # TODO: config
                max_tokens2 = max_tokens - num_input_tokens - 1 - reserve_tokens # need to subtract the number of input tokens, else we get an error from OpenAI # NB! need to subtract an additional 1 token else OpenAI is still not happy: "This model's maximum context length is 8192 tokens. However, you requested 8192 tokens (916 in the messages, 7276 in the completion). Please reduce the length of the messages or completion."

            assert(max_tokens2 > 0)
            time_start = time.time()

            (response_content, finish_reason) = await completion_with_backoff(
                gpt_timeout,
                model = model_name,
                messages = messages,
                # functions = [], # if no functions are in array then the array should be omitted, else error occurs
                # function_call = "none", # 'function_call' is only allowed when 'functions' are specified
                n = 1,
                stream = False, # TODO
                # user = "", # TODO
                temperature = temperature, # 1, 0 means deterministic output # TODO: increase in case of sampling the GPT multiple times per same text
                top_p = 1,
                max_tokens = max_tokens2,
                presence_penalty = 0,
                frequency_penalty = 0,
                # logit_bias = None,
            )

            time_elapsed = time.time() - time_start

            responses.append(response_content)
            too_long = (finish_reason == "length")

            messages2 = list(messages) # clone
            messages2.append({"role": "assistant", "content": response_content}) # need messages2 immediately in order to compute num_total_tokens

            num_total_tokens = num_tokens_from_messages(messages2, model_name, encoding)
            num_output_tokens = num_total_tokens - num_input_tokens
            safeprint(f"num_total_tokens: {num_total_tokens} num_output_tokens: {num_output_tokens} max_tokens: {max_tokens} performance: {(num_output_tokens / time_elapsed)} output_tokens/sec")

            if too_long:
                # first switch to more powerful model without continuation prompt, by instead repeating original prompt on a more powerful model. Only if model upgrade does not help, use continuation prompt.
                if enable_auto_override_to_gpt4_32k and model_name == "gpt-3.5-turbo-16k": # max_tokens == 16384: # current model: "gpt-3.5-turbo-16k"
                    model_upgraded = True
                    model_name = "gpt-4-32k" # https://platform.openai.com/docs/models/gpt-4
                    max_tokens = 32768
                    safeprint(f"Overriding model with {model_name} due to output token count")
                # comment-out: cannot override with gpt-4-turbo for output token increase since this model has only 4096 output tokens
                # elif enable_auto_override_to_gpt4_turbo and model_name == "gpt-3.5-turbo-16k": # max_tokens == 16384: # current model: "gpt-3.5-turbo-16k"
                #     model_upgraded = True
                #     model_name = "gpt-4-turbo-preview"
                #     max_tokens = 128000
                #     safeprint(f"Overriding model with {model_name} due to output token count")
                elif model_name == "gpt-4": # max_tokens == 8192: # current model: "gpt-4"
                    model_upgraded = True
                    if enable_auto_override_to_gpt4_32k:
                        model_name = "gpt-4-32k" # https://platform.openai.com/docs/models/gpt-4
                        max_tokens = 32768
                    # comment-out: cannot override with gpt-4-turbo for output token increase since this model has only 4096 output tokens
                    # elif enable_auto_override_to_gpt4_turbo:
                    #     model_name = "gpt-4-turbo-preview"
                    #     max_tokens = 128000
                    else:
                        model_name = "gpt-3.5-turbo-16k" # https://platform.openai.com/docs/models/gpt-3-5
                        max_tokens = 16384
                    safeprint(f"Overriding model with {model_name} due to output token count")
                elif model_name == "gpt-3.5-turbo": # max_tokens <= 4096: # current model: "gpt-3.5-turbo"
                    model_upgraded = True
                    model_name = "gpt-3.5-turbo-16k" # https://platform.openai.com/docs/models/gpt-3-5
                    max_tokens = 16384
                    safeprint(f"Overriding model with {model_name} due to output token count")
                else:
                    model_upgraded = False
                    messages = messages2
                    messages.append({"role": "assistant", "content": continuation_request}) # TODO: test this functionality. Should it be in user role instead?
                    safeprint(f"Using continuation prompt due to output token count")
            else: #/ if too_long:
                messages = messages2
                continue_analysis = False
            #/ if too_long:
        #/ while continue_analysis:
    #/ with Timer("Sending OpenAI API requests"):

    response = "\n".join(responses)
    return response
#/ async def run_llm_analysis_uncached():

async def run_llm_analysis(config, model_name, encoding, gpt_timeout, messages, continuation_request, temperature = 0, sample_index = 0, enable_cache = True, chunk_index = None, input_file = None, theme = None, querytype = None):
    encrypt_cache_data = config["encrypt_cache_data"]
    enable_auto_override_to_gpt4_32k = config["enable_auto_override_to_gpt4_32k"] # TODO: move to function argument?

    kwargs = {}
    if enable_auto_override_to_gpt4_32k: # else do not bust caches of old code
        kwargs.update({
            "enable_auto_override_to_gpt4_32k": enable_auto_override_to_gpt4_32k
        })

    cache_version = 1
    if encrypt_cache_data:
        result = await async_cached_encrypted(cache_version if enable_cache else None, run_llm_analysis_uncached, model_name, encoding, gpt_timeout, messages, continuation_request, temperature = temperature, sample_index = sample_index, **kwargs)
    else:
        result = await async_cached(cache_version if enable_cache else None, run_llm_analysis_uncached, model_name, encoding, gpt_timeout, messages, continuation_request, temperature = temperature, sample_index = sample_index, **kwargs)

    if result is not None:
        create_opensource_llm_trainingdata = True # TODO: config
        if create_opensource_llm_trainingdata:
            GPT_cache_filename = "GPT_cache#file=" + input_file.replace("\\", "--").replace("/", "--") + "#theme=" + theme + "#sample=" + str(sample_index) + "#chunk=" + str(chunk_index) + "#querytype=" + querytype + ".json"
            GPT_cache_filename = os.path.join("CachedGPTResponses", GPT_cache_filename) # TODO: config for output folder
            if not os.path.exists(GPT_cache_filename):
                GPT_cached_data = list(messages) # clone
                GPT_cached_data.append({
                    "content": result,
                    "role": "assistant"
                })
                GPT_cache_json = json_tricks.dumps(GPT_cached_data, indent=2) # json_tricks preserves dictionary orderings
                dirname = os.path.join(data_dir, os.path.dirname(GPT_cache_filename))
                if not os.path.exists(dirname):
                    os.makedirs(dirname) # TODO: wait until the dir is available, else file write will fail under Windows
                await save_txt(GPT_cache_filename, GPT_cache_json, quiet = True, make_backup = True, append = False)
            #/ if not os.path.exists(GPT_cache_filename):
        #/ if create_opensource_llm_trainingdata:
    else:
        qqq = True # for debugging
    #/ if result is not None:

    return result
#/ async def run_llm_analysis():

def remove_comments(text):
    # re.sub does global search and replace, replacing all matching instances
    # keep text before comment, if it exists
    text = re.sub(r"(^|[\r\n]+[^#]*)\s*#[^\r\n]*", r"\1", text) # NB! keep the newlines before the comment in order to preserve line indexing # TODO: ensure that this does not affect LLM analysis
    return text
#/ def remove_comments(text):

def sanitise_input(text):
    # re.sub does global search and replace, replacing all matching instances

    text = re.sub(r"[{\[]", "(", text)
    text = re.sub(r"[}\]]", ")", text)

    text = re.sub(r"-{2,}", "-", text) # TODO: use some other separator between system instruction and user input
    # text = re.sub(r"-{3,}", "--", text) # TODO: use some other separator between system instruction and user input

    text = re.sub(r"[^\S\r\n]+", " ", text) # replace all repeating whitespace which is not newline with a single space
    text = re.sub(r" ?[\r\n]+ ?", "\n", text) # replace multiple consecutive newlines with a single newline, and drop any whitespace around the newlines

    return text
#/ def sanitise_input(text):

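# Replaces named entities (and optionally numbers) in the input text with anonymous
# placeholders such as "Person A" or "Organisation B", using Spacy named entity
# recognition. Works in two phases: first the unique entities are counted and assigned
# replacement letters (respecting any pre-existing placeholders already present in the
# text), then the actual substitution is performed.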
def anonymise_uncached(user_input, anonymise_names, anonymise_numbers, ner_model, use_only_numeric_replacements=False):
    with Timer("Loading Spacy"):
        import spacy # load it only when anonymisation is requested, since this package loads slowly
        spacy.prefer_gpu()

    with Timer("Loading Named Entity Recognition model"):
        NER = spacy.load(ner_model) # TODO: config setting

    entities = NER(user_input)
    letters = string.ascii_uppercase if not use_only_numeric_replacements else ""

    next_available_replacement_letter_index = 0
    result = ""
    prev_ent_end = 0
    entities_dict = {}
    reserved_replacement_letter_indexes = set()

    active_replacements = ""
    if anonymise_names:
        active_replacements += "Person|Group|Building|Organisation|Area|Location|Event|Language"
    if anonymise_names and anonymise_numbers:
        active_replacements += "|"
    if anonymise_numbers:
        active_replacements += "Money Amount|Quantity|Number"

    if len(active_replacements) > 0:
        # detect any pre-existing anonymous entities like Person A, Person B in the input text and reserve these letters in the dict so that they are not reused
        # TODO: match also strings like "Person 123"
        re_matches = re.findall(r"(^|\s)(" + active_replacements + r")(\s+)([" + re.escape(letters) + r"]|[0-9]+)(\s|:|$)", user_input) # NB! capture also numbers starting with 0 so that for example number 09 still ends up reserving number 9.
        for re_match in re_matches:
            replacement = re_match[1]
            space = re_match[2]
            letter = re_match[3]

            if letter.isalpha():
                replacement_letter_index = ord(letter) - ord("A")
                reserved_replacement_letter_indexes.add(replacement_letter_index)
            else:
                intval = int(letter)
                if intval == 0: # this algorithm does not produce replacement number 0, so we do not need to reserve it, also reserving it would result in reserving the last letter from the alphabet instead
                    continue
                replacement_letter_index = len(letters) + intval - 1 # NB! -1 since replacement numbers start from 1 in the line "replacement_letter = str(replacement_letter_index + 1)"
                reserved_replacement_letter_indexes.add(replacement_letter_index)

            entities_dict[replacement + " " + letter] = replacement_letter_index # use space as separator to normalise the dictionary keys so that same entity with different space formats gets same replacement
            # entities_dict[replacement + space + letter] = replacement_letter_index
        #/ for re_match in re_matches:
    #/ if len(active_replacements) > 0:

    for phase in range(0, 2): # two phases: 1) counting unique entities, 2) replacing them
        for word in entities.ents:
            text_original = word.text
            label = word.label_
            start_char = word.start_char
            end_char = word.end_char

            text_normalised = re.sub(r"\s+", " ", text_original) # normalise the dictionary keys so that same entity with different space formats gets same replacement

            if phase == 0 and text_normalised in entities_dict: # Spacy detects texts like "Location C" as entities
                continue

            if label == "PERSON":
                replacement = "Person" if anonymise_names else None
            elif label == "NORP":
                replacement = "Group" if anonymise_names else None
            elif label == "FAC":
                replacement = "Building" if anonymise_names else None
            elif label == "ORG":
                replacement = "Organisation" if anonymise_names else None
            elif label == "GPE":
                replacement = "Area" if anonymise_names else None
            elif label == "LOC":
                replacement = "Location" if anonymise_names else None
            elif label == "PRODUCT":
                replacement = None # "Product"
            elif label == "EVENT":
                replacement = "Event" if anonymise_names else None
            elif label == "WORK_OF_ART":
                replacement = None # "Work of art"
            elif label == "LAW":
                replacement = None # "Law"
            elif label == "LANGUAGE":
                replacement = "Language" if anonymise_names else None
            elif label == "DATE":
                replacement = None # "Calendar Date" if anonymise_numbers else None # TODO: recognise only calendar dates, not phrases like "a big day", "today", etc
            elif label == "TIME":
                replacement = None # "Time"
            elif label == "PERCENT":
                replacement = None # "Percent"
            elif label == "MONEY":
                replacement = "Money Amount" if anonymise_numbers else None
            elif label == "QUANTITY":
                replacement = "Quantity" if anonymise_numbers else None
            elif label == "ORDINAL":
                replacement = None # "Ordinal"
            elif label == "CARDINAL":
                replacement = (
                    "Number" if
                    anonymise_numbers
                    and len(text_normalised) > 2 # do not anonymise short numbers since they are likely ordinals too
                    and re.search(r"(\d|\s)", text_normalised) is not None # if it is a one-word textual representation of a number then do not normalise it. It might be a phrase like "one-sided" etc, which is actually not a number
                    else None
                )
            else:
                replacement = None

            if phase == 1:
                result += user_input[prev_ent_end:start_char]
                prev_ent_end = end_char

            if replacement is None:
                if phase == 1:
                    result += text_original
            else:
                if phase == 0:
                    if text_normalised not in entities_dict:
                        while next_available_replacement_letter_index in reserved_replacement_letter_indexes:
                            next_available_replacement_letter_index += 1
                        replacement_letter_index = next_available_replacement_letter_index
                        entities_dict[text_normalised] = replacement_letter_index
                        reserved_replacement_letter_indexes.add(replacement_letter_index)
                    #/ if text_normalised not in entities_dict:
                else: #/ if phase == 0:
                    replacement_letter_index = entities_dict[text_normalised]
                    if len(reserved_replacement_letter_indexes) <= len(letters):
                        replacement_letter = letters[replacement_letter_index]
                    else:
                        replacement_letter = str(replacement_letter_index + 1) # use numeric names if there are too many entities in input to use letters
                    result += replacement + " " + replacement_letter
                #/ if phase == 0:
            #/ if replacement is None:
        #/ for word in entities.ents:
    #/ for phase in range(0, 2):

    result += user_input[prev_ent_end:]
    return result
#/ def anonymise_uncached()

async def anonymise(config, user_input, anonymise_names, anonymise_numbers, ner_model, enable_cache = True):
    # Spacy's NER is not able to see names separated by multiple spaces as a single name. Newlines in names are fortunately ok though. Tabs are ok too, though they will still be replaced in the following regex.
    # Replace spaces before caching so that changes in spacing do not require cache update
    user_input = re.sub(r"[^\S\r\n]+", " ", user_input) # replace all repeating whitespace which is not newline with a single space - https://stackoverflow.com/questions/3469080/match-whitespace-but-not-newlines

    encrypt_cache_data = config["encrypt_cache_data"]
    cache_version = 1
    if encrypt_cache_data:
        result = await async_cached_encrypted(cache_version if enable_cache else None, anonymise_uncached, user_input, anonymise_names, anonymise_numbers, ner_model)
    else:
        result = await async_cached(cache_version if enable_cache else None, anonymise_uncached, user_input, anonymise_names, anonymise_numbers, ner_model)

    return result
#/ async def anonymise():

def render_highlights_uncached(user_input, expression_dicts):
    with Timer("Loading Spacy HTML renderer"):
        from spacy import displacy # load it only when rendering is requested, since this package loads slowly

    highlights_html = displacy.render(
        {
            "text": user_input,
            "ents": [
                {
                    "start": entry["start_char"],
                    "end": entry["end_char"],
                    # "label": ", ".join(entry["labels"].keys())
                    "label": ", ".join([
                        f"{label}: {percent:.0f}%"
                        if percent is not None
                        else label
                        for label, percent in entry["labels"].items()
                    ])
                }
                for entry
                in expression_dicts
            ],
            "title": None
        },
        style="ent", manual=True
    )

    return highlights_html
#/ def render_highlights_uncached():

async def render_highlights(config, user_input, expression_dicts, enable_cache = True):
    encrypt_cache_data = config["encrypt_cache_data"]
    cache_version = 2
    if encrypt_cache_data:
        result = await async_cached_encrypted(cache_version if enable_cache else None, render_highlights_uncached, user_input, expression_dicts)
    else:
        result = await async_cached(cache_version if enable_cache else None, render_highlights_uncached, user_input, expression_dicts)

    return result
#/ async def render_highlights():

def parse_labels(all_labels_as_text):
    labels_list = []
    lines = all_labels_as_text.splitlines(keepends=False)
    for line in lines:
        line = line.strip()

        if line[:1] == "-": # [:1] enables handling of empty lines, while [0] would throw exception
            line = line[1:].strip()

        line = sanitise_input(line)
        line = re.sub(r"[.,:;]+", "/", line).strip() # remove punctuation from labels

        if len(line) == 0:
            continue

        labels_list.append(line)
    #/ for line in all_labels_as_text.splitlines(keepends=False):

    labels_list = list(OrderedDict.fromkeys(labels_list)) # keep only unique labels, while preserving labels ordering
    all_labels_as_text = "\n".join("- " + x for x in labels_list)
    # labels_list.sort()

    return (labels_list, all_labels_as_text)
#/ def parse_labels():

def split_text_into_chunks_worker(encoding, paragraphs, paragraph_token_counts, separator, separator_token_count, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False): # TODO: overlap_chunks_at_least_halfway
    chunks = []
    current_chunk = [] # chunk consists of a list of paragraphs
    current_chunk_token_count = 0

    for paragraph, paragraph_token_count in zip(paragraphs, paragraph_token_counts):
        if current_chunk_token_count > 0:
            if current_chunk_token_count + separator_token_count + paragraph_token_count <= max_tokens_per_chunk:
                current_chunk_token_count += separator_token_count + paragraph_token_count
                current_chunk.append(separator) # TODO: keep original separators that were present in text
                current_chunk.append(paragraph)
                continue
            else: # current chunk has become full, so lets finalise it and start a new chunk
                chunks.append((current_chunk, current_chunk_token_count), )
                current_chunk = []
                current_chunk_token_count = 0
        #/ if current_chunk_token_count > 0:

        if paragraph_token_count <= max_tokens_per_chunk:
            current_chunk_token_count = paragraph_token_count
            current_chunk.append(paragraph)
        else:
            return None # max_tokens_per_chunk is too small
    #/ for paragraph in paragraphs:

    if current_chunk_token_count > 0:
        chunks.append((current_chunk, current_chunk_token_count), )

    # TODO: find a way to distribute the characters roughly evenly over chunks so that the last chunk is not smaller than the other chunks. This probably needs some combinatorial optimisation to achieve it though.

    return chunks
#/ def split_text_into_chunks_worker(encoding, paragraphs, paragraph_token_counts, separator, separator_token_count, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False)

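# Splits the paragraphs into chunks that each fit within max_tokens_per_chunk. When
# balance_chunk_sizes is enabled, a binary search over the chunk size limit is used to
# make the chunks roughly equal in size without increasing the total number of chunks.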
def split_text_into_chunks(encoding, paragraphs, paragraph_token_counts, separator, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False, balance_chunk_sizes = True): # TODO: overlap_chunks_at_least_halfway
    assert(max_tokens_per_chunk > 0)

    separator_tokens = encoding.encode(separator)
    separator_token_count = len(separator_tokens)

    chunks = split_text_into_chunks_worker(encoding, paragraphs, paragraph_token_counts, separator, separator_token_count, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False)
    if chunks is None: # max_tokens_per_chunk is too small
        return None

    if balance_chunk_sizes and len(chunks) > 1:
        max_allowed_chunks = len(chunks) # do not increase the number of chunks during balancing
        best_chunks = None
        upper_bound = None
        exclusive_lower_bound = 0
        best_chunk_try_index = None # for debugging
        try_count = 1 # for debugging

        # using binary search for finding smallest max_tokens_per_chunk. Assuming that the split with smallest max_tokens_per_chunk always wins with regards to the balancedness metric
        while True: # TODO: apply time limit to this loop
            if chunks is None or len(chunks) > max_allowed_chunks: # do not increase the number of chunks during balancing # chunks is None when some paragraph was too long for current max_tokens_per_chunk
                exclusive_lower_bound = max_tokens_per_chunk
                max_tokens_per_chunk = int((upper_bound + exclusive_lower_bound + 1) / 2) # round up
                if max_tokens_per_chunk == upper_bound: # tried that already # upper_bound is set only when chunks is not None, therefore in this case chunks is not None here
                    break
            else: # if chunks is None or len(chunks) > max_allowed_chunks:
                chunk_sizes_in_tokens = [chunk_token_count for (chunk_paragraphs, chunk_token_count) in chunks] # measure the chunk sizes only when the worker succeeded, since chunks may be None after a retry below
                biggest = max(chunk_sizes_in_tokens)
                best_chunks = chunks
                best_chunk_try_index = try_count
                upper_bound = biggest
                max_tokens_per_chunk = int((upper_bound + exclusive_lower_bound) / 2) # round down
                if max_tokens_per_chunk == exclusive_lower_bound: # tried that already
                    break
            #/ if len(chunks) > max_allowed_chunks:

            # retry with different chunk size limit
            chunks = split_text_into_chunks_worker(encoding, paragraphs, paragraph_token_counts, separator, separator_token_count, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False)
            try_count += 1
        #/ while True:

        chunks = best_chunks
    #/ if balance_chunk_sizes and len(chunks) > 1:

    chunks = ["".join(chunk_paragraphs) for (chunk_paragraphs, chunk_token_count) in chunks]

    return chunks
#/ def split_text_into_chunks(encoding, paragraphs, separator, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False)

# def split_text_into_chunks_alternate(encoding, paragraphs, separator, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False, balance_chunk_sizes = True): # TODO: overlap_chunks_at_least_halfway
#     paragraph_token_counts = []
#     for paragraph in paragraphs:
#         paragraph_tokens = encoding.encode(paragraph)
#         paragraph_token_count = len(paragraph_tokens)
#         paragraph_token_counts.append(paragraph_token_count)
#
#     separator_tokens = encoding.encode(separator)
#     separator_token_count = len(separator_tokens)
#
#     chunks = split_text_into_chunks_worker(encoding, paragraphs, paragraph_token_counts, separator, separator_token_count, max_tokens_per_chunk, overlap_chunks_at_least_halfway = False)