-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathv2_4_2_main_Text_Analyzer_Step_through_Viewer.py
executable file
·1877 lines (1628 loc) · 70.1 KB
/
v2_4_2_main_Text_Analyzer_Step_through_Viewer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
import sys
import os
import time
import random
from abc import ABCMeta, abstractmethod
"""
Step-through Text and Word Viewer
Author: Karl Toby Rosenberg
Version 2_4_2, December 2016
"Text Analyzer and Step-through Viewer" is a utility to navigate an input text
and examine the usage, repetition, and proximity of words.
The viewer can jump forwards and backwards in an input text by line or
between instances of particular words to see distances between those word
instances.
"""
# module-level constants:
# welcome banner text
PROGRAM_BANNER = ("+============================================+\n"
"| Welcome! |\n"
"+============================================+\n")
# directory of the program file (symbolic links resolved by realpath)
PROGRAM_HOME = os.path.dirname(os.path.realpath(__file__))
# for user input and control options in text_step and word_step functions
# ENTER is what a stripped, empty input line compares equal to
ENTER = ''
# commands that move to the next / previous instance of the chosen word
W_NEXT_INST = '>'
W_PREV_INST = '<'
# commands that re-display the step instructions
INSTRUCTIONS = frozenset(['qa', "help"])
# several names deliberately share one value:
# QUIT and ERROR are the return codes of text_step,
# NO_MOVE replaces the pending command in word_step when a movement
# would go past the first or last instance of the word
YES = NEXT_LINE = DEFAULT = 1
NO = QUIT = FIRST = 0
NO_MOVE = ERROR = -1
# command returned by word_step when nothing should change in text_step
NO_OP = "NOOP"
# for accessing word_analysis list
# (indices into the per-word list built by calc_word_analysis)
WORD_COUNT = 0
LINE_NUMBERS = 1
ITH_WORD_IN_TEXT = 2
ITH_CHAR_ON_LINE = 3
# UNUSED
# ICHARINTEXT = 4
# ASCII values for alphabetic characters:
# 65-90 are the codes of 'A'-'Z', 97-122 are the codes of 'a'-'z'
# NOTE(review): the _LO / _UP suffixes presumably name the lower and upper
# code ranges rather than letter case (A_LO..Z_LO bounds the UPPERCASE
# letters) -- confirm before renaming
A_LO = 65
Z_LO = 90
A_UP = 97
Z_UP = 122
#############################
def binary_min_line_above_search(line_numbers, low, high, starting_line):
    """
    given a starting line number and a sorted list of valid line numbers,
    finds and returns the index of the nearest line number greater than or
    equal to the starting line

    when the same line number is stored several times (the word occurs
    more than once on that line) the index of the FIRST such entry is
    returned, so that no instance on the line is skipped when stepping
    forwards
    param:
        list[int] line_numbers (list of line number candidates,
            in non-decreasing order)
        int, low (lowest index to search)
        int, high (highest index to search, inclusive)
        int, starting_line (the line from which to start the search for
            the nearest later line)
    return:
        int, the index of the valid line found,
            -1 if no such line exists in line_numbers[low..high]
            (including when the range is empty)
    """
    # imported here so the function does not depend on a file-level import
    from bisect import bisect_left

    # an empty search range (e.g. an empty list searched with 0, -1)
    # cannot contain a valid line
    if low > high:
        return -1

    # leftmost index in [low, high + 1) whose value is >= starting_line
    index_first_valid_line = bisect_left(line_numbers, starting_line,
                                         low, high + 1)

    # bisect reports high + 1 when every candidate is below starting_line
    if index_first_valid_line > high:
        return -1
    return index_first_valid_line
def binary_max_line_below_search(line_numbers, low, high, starting_line):
    """
    given a starting line number and a sorted list of valid line numbers,
    finds and returns the index of the nearest line number less than or
    equal to the starting line

    when the same line number is stored several times (the word occurs
    more than once on that line) the index of the LAST such entry is
    returned, i.e. the instance nearest to the starting position when
    stepping backwards
    param:
        list[int] line_numbers (list of line number candidates,
            in non-decreasing order)
        int, low (lowest index to search)
        int, high (highest index to search, inclusive)
        int, starting_line (the line from which to start the search for
            the nearest earlier line)
    return:
        int, the index of the valid line found,
            -1 if no such line exists in line_numbers[low..high]
            (including when the range is empty)
    """
    # imported here so the function does not depend on a file-level import
    from bisect import bisect_right

    # an empty search range (e.g. an empty list searched with 0, -1)
    # cannot contain a valid line
    if low > high:
        return -1

    # bisect_right gives the first index in [low, high + 1) whose value is
    # > starting_line, so the entry just before it is the last one <= it
    index_first_valid_line = bisect_right(line_numbers, starting_line,
                                          low, high + 1) - 1

    # falls below low when every candidate is above starting_line
    if index_first_valid_line < low:
        return -1
    return index_first_valid_line
def clean_word(word):
    """
    builds a copy of the given string (word) that keeps only the
    characters whose code points lie in the ranges A_LO..Z_LO or
    A_UP..Z_UP (the ASCII letters); every other character is dropped
    param:
        string, word
    return:
        string, the filtered word (None if word is None)
    """
    # nothing to clean
    if word is None:
        return None

    # keep each character whose code falls in one of the two letter ranges
    return ''.join(
        letter for letter in word
        if A_LO <= ord(letter) <= Z_LO or A_UP <= ord(letter) <= Z_UP
    )
def is_valid_char(char, in_word_punct):
    """
    checks whether a given character may be part of a word:
    either an alphabetic character or one of the accepted in-word
    punctuation marks

    alphabetic is decided with str.isalpha, the same test that
    calc_word_analysis uses when it appends characters to a word;
    an ASCII-only test here made every non-ASCII letter end the word
    it had just been appended to
    param:
        string, char (character to check)
        set/frozenset/dictionary, in_word_punct
            (collection of non-alphabetic characters accepted inside
            words, only membership is tested)
    return:
        boolean (True if the character is alphabetic or is contained in
            in_word_punct, False otherwise)
    """
    return char.isalpha() or char in in_word_punct
def print_step_instructions():
    """
    prints the list of commands understood by the text_step and
    word_step functions, followed by a separator line
    """
    # one entry per output line, joined with new-lines when printed
    command_help = (
        "TEXT STEP COMMANDS:",
        " -enter a positive number n to display the next n lines",
        " -a negative number -n to move to a specific line number n",
        " -the < and > character keys to skip to",
        " the previous or next instance of a word",
        " -qa to display the commands again",
        " -0 to leave text step for this file",
        "--------------------------------------------------",
    )
    print("\n".join(command_help))
def word_step(text_as_lines, word_analysis, starting_line, choice='>'):
    """
    skips to instances of the chosen word within the text,
    displays number of words skipped with each movement,
    displays position of each word instance with respect to the "list" of all
    words in the text
    enter '<' or '>' to skip to the previous or next instance of the chosen word
    param:
        list, text_as_lines
            the entire input text divided into lines,
            where line i is stored in text_as_lines[i-1]
        list, word_analysis
            information pertaining to a specific word in the text:
            word_analysis[0]
                int (number of instances of the given word in the text)
            word_analysis[1]
                list[int] (for each instance of the given word,
                stores--in order--the line numbers on which the word occurred)
            word_analysis[2]
                list[int]
                (interpret the text as a list of words,
                where word i is the ith word in the text,
                this list stores the word index i for each instance of the
                given word)
            word_analysis[3]
                list[int]
                (interpret the text as a list of strings where each string
                is a line in the text with indices 0-length_of_line-1,
                this list stores the index of the first character of the
                given word for each instance of the word,
                with respect to its line.
                Entry k belongs to the same instance as word_analysis[1][k])
            word_analysis = [
                1,
                [line_count-1],
                [word_i],
                [pos_on_line]
            ]
        int, starting_line (the current line in the text)
    param (opt.):
        string, choice:
            for now word_step is entered only from text_step when the
            '<' or '>' command is entered
            (to step to the previous or the next instance),
            the default choice value is '>'
    return:
        (string, int) 2-tuple (command, current line number)
            used so text_step line number and next command are
            consistent with changes and commands in word_step
            (pending command and line number):
            (NO_OP, 0) if '>' was requested but no instance exists at or
                after starting_line,
            ("1", line) if an empty command (enter) was given,
            (str(n), line) if an integer n was given
    """
    line_nums = word_analysis[LINE_NUMBERS]
    word_i = word_analysis[ITH_WORD_IN_TEXT]
    pos_on_line = word_analysis[ITH_CHAR_ON_LINE]

    # track current line
    current_line = starting_line
    # track ith instance of word
    w_inst_index = 0
    # number of word instances
    num_word_inst = len(word_i)

    # find first instance of word at/after or at/before starting line

    # store result of searches (index of a desired word instance)
    found = -1

    # the command is to find the next word instance
    if choice == W_NEXT_INST:
        if starting_line > 1:
            # binary search for the index of the first valid line at or
            # after starting_line
            found = binary_min_line_above_search(line_nums, 0,
                                                 len(line_nums) - 1,
                                                 starting_line)
            # return the no-op command if the end of the file has been
            # reached (no more instances later in the text) to exit
            if found == -1:
                print("Last instance reached\n---------------------")
                return NO_OP, 0
        else:
            # from the first line the nearest instance is the first one
            current_line = line_nums[0]
    # the command is to find the previous word instance
    elif choice == W_PREV_INST:
        if starting_line > 1:
            # binary search for the index of the first valid line
            # at or below starting_line
            found = binary_max_line_below_search(line_nums, 0,
                                                 len(line_nums) - 1,
                                                 starting_line)
            # if no earlier word instance is found,
            # move to the first one in the text
            if found == -1 or current_line < line_nums[found]:
                print("No instance earlier, starting at first instance\n")
                current_line = line_nums[0]
        else:
            print("No instance earlier, starting at first instance\n")
            current_line = line_nums[0]

    # set the current word instance index and
    # set the current line to be the instance's line
    if found >= 0:
        w_inst_index = found
        current_line = line_nums[w_inst_index]

    ################
    # True if the latest command is valid
    legal_command = True
    # command (cleared so no "words skipped" count is shown on entry)
    choice = ''

    # the instance index never moves past the final instance of the word,
    # so the loop is left through one of the return statements below
    while w_inst_index < num_word_inst:
        # print the current line
        print(text_as_lines[current_line-1], end='')

        # display the marker for the current instance of the word,
        # display the number of words between current and previous
        # instances of the word
        if legal_command:
            # display the word marker (preceded by proper number of spaces)
            # under the current text line
            print('{0:>{1:d}}{2:d}'.format('^- w',
                                           pos_on_line[w_inst_index]+4,
                                           word_i[w_inst_index]))
            # display the number of words between the current word instance
            # and the previous word instance reached
            if choice == W_NEXT_INST:
                print('{0}{1:d}'.format("words skipped forwards: ",
                                        (word_i[w_inst_index]
                                         - word_i[w_inst_index-1] - 1)))
            elif choice == W_PREV_INST:
                print('{0}{1:d}'.format("words skipped backwards: ",
                                        (word_i[w_inst_index+1]
                                         - word_i[w_inst_index] - 1)))

        legal_command = True

        # display current line number, prompt for the next command
        choice = input("L" + str(current_line) + ">> ").strip()
        print()

        # CHECK COMMANDS

        # move to next word instance
        if choice == W_NEXT_INST:
            # if the next word instance index equals
            # the number of word instances in the text,
            # then the end of the text has been reached, no-op
            if w_inst_index + 1 == num_word_inst:
                print("Last instance reached\n---------------------")
                # no-op command (suppresses the "words skipped" count)
                choice = NO_MOVE
            else:
                # increment the word instance index
                w_inst_index += 1
                # move to the line of that instance
                current_line = line_nums[w_inst_index]
        # move to previous word instance
        elif choice == W_PREV_INST:
            # if the first word instance has already been reached,
            # stay on it
            if w_inst_index == 0:
                print("First instance reached\n----------------------")
                # no-op command (suppresses the "words skipped" count)
                choice = NO_MOVE
            else:
                # otherwise decrement the word instance index
                w_inst_index -= 1
                # move to the line of that instance
                current_line = line_nums[w_inst_index]
        # enter, exit word_step and proceed to the next line
        elif choice == ENTER:
            # return a step of 1 (move to next line) and the current line
            # number
            return "1", current_line
        # display instructions
        elif choice in INSTRUCTIONS:
            print_step_instructions()
        else:
            # if the command is a valid integer,
            # return it as the pending command for text_step
            try:
                return str(int(choice)), current_line
            # int() signals a non-numeric command with ValueError;
            # only that is treated as an illegal command, anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate
            except ValueError:
                legal_command = False
                print("INVALID command")
                continue
def text_step(text_as_lines, word_analysis=None):
    """
    step-through lines in the text,
    enter a positive number n to display and step forward by n lines
    enter a negative number -n to skip to line number |-n|
        (n must be an existing line number, otherwise the command is
        rejected as invalid)
    enter '<' or '>' to skip to the previous or
        next instance of the chosen word (see word_step() )
        (whose word_analysis list is passed to text_step() )
    enter "qa" to display the instructions
    enter 0 to exit
    the end of the input stream (EOF) is treated like the exit command
    param:
        list, text_as_lines
            the entire input text divided into lines,
            where line i is stored in text_as_lines[i-1]
    param (opt.):(can only move between word instances with a word_analysis)
        list, word_analysis
            information pertaining to a specific word in the text:
            word_analysis[0]:
                int (number of instances of the given word in the text)
            word_analysis[1]
                list[int]
                (for each instance of the given word, stores--in order--the
                line numbers on which the word occurred)
            word_analysis[2]
                list[int]
                (interpret the text as a list of words,
                where word i is the ith word in the text,
                this list stores the word index i for each instance of
                the given word)
            word_analysis[3]
                list[int]
                (interpret the text as a list of strings where each string
                is a line in the text with indices 0-length_of_line-1,
                this list stores the index of the first character of
                the given word for each instance of the word,
                with respect to its line.
                Entry k belongs to the same instance as word_analysis[1][k])
            word_analysis = [
                1,
                [line_count-1],
                [word_i],
                [pos_on_line]
            ]
    return:
        0 (QUIT) upon success,
        -1 (ERROR) upon an error e.g. in word step
    """
    #################################
    if text_as_lines is None:
        return ERROR

    total_lines = len(text_as_lines)
    # lines displayed in a row
    cur_step = 0
    # maximum number of steps in a row / latest command
    # (a string while a command is being interpreted, an int otherwise)
    step = 1
    # movement between word instances needs a word_analysis
    word_step_is_on = word_analysis is not None
    # current line number (displayed)
    current_line_l = 0

    # display the instructions upon first call of function
    if text_step.first_time:
        print_step_instructions()
        text_step.first_time = False

    # accept commands until the end of the text has been reached
    while current_line_l < total_lines:
        # print the current line
        print(text_as_lines[current_line_l], end='')
        # increment the number of lines that have been displayed in a row
        cur_step += 1
        # increment the line number
        current_line_l += 1

        # print the next line if there are more lines to display in a row
        if cur_step < step:
            continue

        # otherwise CHECK COMMANDS,
        # prompting again until a command moves the viewer or quits
        while True:
            # display the current line number,
            # prompt for the next command
            try:
                step = input("L" + str(current_line_l) + ">> ").strip()
            except EOFError:
                # no further command can arrive once the input stream is
                # closed; prompting again would loop forever
                print()
                return QUIT

            # reset the lines-displayed-in-a-row counter
            cur_step = 0

            # move to the next or previous instance of a word
            if step == W_NEXT_INST or step == W_PREV_INST:
                if not word_step_is_on:
                    print("No word specified\n")
                    continue
                try:
                    # call word_step to handle movement to instances of
                    # specific words,
                    # returns a tuple (command, line_number)
                    # so text_step can update the current line
                    # and try the next command
                    control = word_step(text_as_lines, word_analysis,
                                        current_line_l, step)
                    if control[0] == NO_OP:
                        continue
                    current_line_l = control[1]
                except Exception:
                    # any failure inside word_step ends the session with
                    # the documented ERROR return value
                    print("CRITICAL ERROR, WORD STEP FAILED")
                    return ERROR
                # interpret the command handed back by word_step below
                step = control[0]
            # enter, move to the next line and print it
            elif step == ENTER:
                step = 1
                break
            # display the instructions
            elif step in INSTRUCTIONS:
                print_step_instructions()
                continue

            # otherwise interpret the command as an integer;
            # if it is not one, prompt for a new command
            try:
                step_as_int = int(step)
            except ValueError:
                print("INVALID command")
                continue

            # if the command is a negative number,
            # interpret it as a command to jump to a
            # specific line number abs(step)
            if step[0] == "-":
                target_line = -step_as_int
                # "-0" and line numbers past the end of the text do not
                # name a line that can be displayed
                if not 1 <= target_line <= total_lines:
                    print("INVALID command")
                    continue
                current_line_l = target_line - 1
                step = 1
                break
            # if the command is a positive number,
            # interpret it as the number of lines
            # to print in succession
            elif step_as_int > 0:
                step = step_as_int
                break
            # if the command is 0, quit with a return value of 0
            else:
                return QUIT

    # before returning from the function,
    # display the final line number if the end of the file has been reached
    print('\nEnd of file reached after L{0:d}\n'.format(current_line_l))
    return QUIT


# function attribute,
# True if function call is the first one of the current session
text_step.first_time = True
def calc_word_analysis(text_file,
in_word_punct=frozenset(["'", '-', u"’"]),
eq_words={"can't":["can", "not"],
"cannot":["can", "not"],
"won't":["will", "not"],
"shouldn't":["should", "not"],
"you'd":["you", "would"],
"you'll":["you", "will"],
"you're":["you", "are"]
},
save_sequence=False
):
"""
calculates word frequencies given a text string,
can find additional (optional) information, ignore trivial words,
ignore words above a certain length,
other possibilities are a work-in-progress
param:
file, text_file (the file object representing the chosen text)
param (opt.):
frozenset[string], in_word_punct
(set of punctuation and marks used as part of words)
dictionary, eq_words (dictionary of words to consider as
other words or combinations of words)
NOTE: Currently unused
boolean, save_sequence (whether to save a list of references to
all words, duplicates included,
in the order they are seen in the input text)
return:
dictionary analysis_dict[string:[...]]
(of word_analysis dictionary and optional dictionaries)
list, word_analysis (access with analysis_dict["word analysis"])
information pertaining to a specific word in the text,
access with analysis_dict["word analysis"]
word_analysis[0] (word_analysis[WORD_COUNT])
int (number of instances of the given word in the text)
word_analysis[1] (word_analysis[LINE_NUMBERS])
list of int (for each instance of the given word,
stores--in order--the line numbers where the word exists)
word_analysis[2] (word_analysis[ITH_WORD_IN_TEXT])
list of int (understand the entire text as a list of words,
where word i is the ith word in the text,
this list stores the word index i for each instance of
the given word)
word_analysis[3] (word_analysis[ITH_CHAR_ON_LINE])
list of int (understand the entire text as a list of strings
where each string is a line in the text with indices
0-length_of_line-1,
this list stores the index of the first character of
the given word for each instance of the word,
with respect to its line.
word_analysis [
1,
[line_count-1],
[word_i],
[pos_on_line]
]
UNUSED/UNCALCULATED (May reuse later):
word_analysis[4]:
list of int (interpret the entire text as a single
string with indices 0-length_of_text-1,
this list stores the index of the first character of
the given word for each instance of the word)
list[int] text_as_lines (access with analysis_dict["text as lines"])
the entire input text divided into lines,
where line i is stored in text_as_lines[i-1]
list[string] word list (access with analysis_dict["word list"])
access list of words with analysis_dict[1]
list[string] (the sequence of words, duplicates included, that
appear in the input text, access with
analysis_dict["word sequence"])
int, number of words (access with analysis_dict["total words"])
Temporarily removed / work-in-progress options
(NOTE: will redo outside function):
gender: (access with analysis_dict["gender stat"]
access counts with [m] and [f]
access percentages with [%_m] and [%_f]
access percent of words identifiable as
masculine or feminine with [%_indentifiable]
mood: (access with analysis_dict["mood stat"])
access counts with [:D] and [D:]
access percentages with [%_:D] and [%_D:]
access percent of words identifiable as happy or sad
with [%_indentifiable]
"""
if text_file is None or in_word_punct is None:
return None
# dictionary of lists and dictionaries to return
analysis_dict = {}
# word analysis dictionary of word count and lists
# (variables declared at top of file simplify access for user)
word_analysis = {}
# word list
word_list = []
if save_sequence:
# word sequence (duplicates allowed)
word_seq = []
# save reference to word_seq.append
word_seq_append_ = word_seq.append
# dictionary of gender word counts (-1 counts if unused)
# gender_stat = {'m':-1, 'f':-1}
# dictionary of mood stat counts (-1 counts if unused)
# mood_stat = {':D':-1, 'D:':-1}
# save reference to word_list.append
word_list_append_ = word_list.append
# save reference to str.lower()
lower_ = str.lower
# save reference to str.isalpha()
isalpha_ = str.isalpha
# create a new list to store each character to be combined into a word
new_word = []
# save reference to new_word.append
new_word_append_ = new_word.append
#for each line L store the line at index L in text_as_lines
text_as_lines = []
text_as_lines_append_ = text_as_lines.append
# given text, create a word frequency dictionary of words in
# all_text stripped of invalid punctuation,
# records word positions, line positions, number of words between
# instances of a given word
# for use with text_step and word_step
###########################################################
# track the number of characters reached so far
# with respect to the current line
char_count_line = -1
# UNUSED
# track the number of characters reached so far
# with respect to the whole text
# char_count_text = -1
# counter tracks whether multiple punctuation marks appear in a row,
# used to allow for words with "inside punctuation"
# (e.g. good-natured has a hyphen)
# but does not allow words with multiple punctuation or non-alphabetical
# characters in a row
double_punct = 0
# marks a word as alphabetical
has_alpha = False
# keep a buffer of punctuation marks to allow for in-word punctuation
# without adding punctuation immediately after the word
punct_buffer = []
# save reference to punct_buffer.append
punct_buffer_append_ = punct_buffer.append
# count the line number according to '\n' characters in text
line_count = 1
# count the number of words found
word_i = 0
# word start index with respect to lines
pos_on_line = 0
# UNUSED
# word start index with respect to text
# pos_in_text = 0
# read the first line
line = text_file.readline()
# iterate as long as another line exists in the text
while line:
# store the line in the text
text_as_lines_append_(line)
# iterate through each character in the input text
for char in line:
char_count_line += 1
# UNUSED
# char_count_text += 1
# if char is new-line,
if char == '\n':
# reset the number of characters reached
# with respect to the line
char_count_line = -1
# increment the line count
line_count += 1
# proceed immediately to the next character
# if the char is not alphabetic, continue to the next character
# or if the current word under
# construction has no alphabetic characters
# (words must begin with an alphabetic character)
if not has_alpha and not isalpha_(char):
continue
# treat alphabetic characters
if isalpha_(char):
# if the current word under construction
# has no alphabetical characters so far (is empty),
# mark the starting position of the word,
# mark the word as alphabetic
if not has_alpha:
pos_on_line = char_count_line
# UNUSED
# pos_in_text = char_count_text
has_alpha = True
# if characters are waiting in the punctuation buffer,
# first append them to the word under construction,
# then clear the buffer
if len(punct_buffer) > 0:
new_word_append_(''.join(punct_buffer))
del punct_buffer[:]
# append the current alphabetic character
# to the word under construction
new_word_append_(lower_(char))
# reset the punctuation-in-a-row counter to 0
# since the alphabetic character ends the streak
double_punct = 0
#treat valid punctuation/characters
elif char in in_word_punct:
# if the punctuation-in-a-row counter is 0,
# append the current punctuation/valid non-alphabetic mark
# to the punctuation buffer
# and increment the punctuation-in-a-row counter
# -punctuation is not added immediately in case, for example,
# the current character is a hyphen,
# which can be safely added in the middle of a word,
# but cannot be added at the end of one.
# The hyphen is not added to the end of a word,
# as the word is considered complete before it can be
# (incorrectly) added.
if double_punct == 0:
punct_buffer_append_(char)
double_punct += 1
# the current word has been completed if:
# the punctuation-in-a-row counter is set to 2
# (words cannot have multiple punctuation marks in a row)
# or the character is not alphabetic or
# an otherwise valid punctuation mark or character
if double_punct == 2 or not is_valid_char(char, in_word_punct):
# clear the punctuation buffer
del punct_buffer[:]
# reset the punctuation-in-a-row count
double_punct = 0
# reset has_alpha to prepare
# for the next round of valid word-checking
has_alpha = False
# (an additional check) to
# make sure that the new word has a valid length
if len(new_word) > 0:
# a new word has been completed, increment the word counter
word_i += 1
# saved the characters in new_word as a joined_word
joined_word = sys.intern(''.join(new_word))
# if the new word has not been added to the dictionary
# and the word is alphabetical,
# add an entry for the word in the word list
# and in the dictionary with a count of 1
if joined_word not in word_analysis:
# WORD ANALYSIS CONTENTS:
# - integer representing the total word count
# for the given word,
# - list of line numbers on which the word appears
# - list of the positions of each instance of the word
# with respect to the list of words in the entire text
# - list of the positions of the first char
# for each instance of the word
# with respect to the entire text,
# - list of the positions of the first char
# for each instance of the word
# with respect to the current line in the text
# add an entry for the joined_word
if char == '\n':
# if the current character is a new-line character,
# the line-count is off by +1
word_analysis[joined_word] = [
1,
[line_count-1],
[word_i],
[pos_on_line]#,
#[pos_in_text]
]
else:
word_analysis[joined_word] = [
1,
[line_count],
[word_i],
[pos_on_line]#,
#[pos_in_text]
]
# add new word to word list
word_list_append_(joined_word)
# else if the new word has already been
# added to the dictionary,
# increment the frequency count
# and add or update other information for that word
else:
# access the in-progress word data
word_data = word_analysis[joined_word]
# increment the word frequency count
word_data[WORD_COUNT] += 1
# append the next valid line number
if char == '\n':
word_data[LINE_NUMBERS].append(line_count-1)
else:
word_data[LINE_NUMBERS].append(line_count)
# append the ith word value for the
# current instance of the word
word_data[ITH_WORD_IN_TEXT].append(word_i)
# append the starting position/index of the
# current word instance with respect to the current line
word_data[ITH_CHAR_ON_LINE].append(pos_on_line)
# UNUSED
# append the starting position/index of the
# current word instance with respect to the whole text
# word_data[ICHARINTEXT].append(pos_in_text)
if save_sequence:
word_seq_append_(joined_word)
# reset the word string
del new_word[:]
# try to read the next line
line = text_file.readline()
# The following checks whether there are any trailing characters
# words are missed if the input file does not have an ending new-line
# Rather than add an extra conditional in the main loop,
# I add duplicated code (it's a trade-off)
if len(new_word) > 0:
# a new word has been completed, increment the word counter
word_i += 1
# saved the characters in new_word as a joined_word
joined_word = sys.intern(''.join(new_word))
# if the new word has not been added to the dictionary
# and the word is alphabetical,
# add an entry for the word in the word list
# and in the dictionary with a count of 1
if joined_word not in word_analysis:
# WORD ANALYSIS CONTENTS:
# - integer representing the total word count
# for the given word,
# - list of line numbers on which the word appears
# - list of the positions of each instance of the word
# with respect to the list of words in the entire text
# - list of the positions of the first char
# for each instance of the word
# with respect to the entire text,
# - list of the positions of the first char
# for each instance of the word
# with respect to the current line in the text
# add an entry for the joined_word
if char == '\n':
# if the current character is a new-line character,
# the line-count is off by +1
word_analysis[joined_word] = [
1,
[line_count-1],
[word_i],
[pos_on_line]#,
#[pos_in_text]
]
else:
word_analysis[joined_word] = [
1,
[line_count],
[word_i],
[pos_on_line]#,
#[pos_in_text]
]
# add new word to word list
word_list_append_(joined_word)
# else if the new word has already been added to the dictionary,
# increment the frequency count and other information for that word
else:
# access the in-progress word data
word_data = word_analysis[joined_word]
# increment the word frequency count
word_data[WORD_COUNT] += 1
# append the next valid line number
if char == '\n':
word_data[LINE_NUMBERS].append(line_count-1)
else:
word_data[LINE_NUMBERS].append(line_count)
# append the ith word value for the current instance of the word
word_data[ITH_WORD_IN_TEXT].append(word_i)
# append the starting position/index of the current word instance
# with respect to the current line
word_data[ITH_CHAR_ON_LINE].append(pos_on_line)
if save_sequence:
word_seq_append_(joined_word)
# if the text does not end with a new-line character
# append a guard new-line character
if len(text_as_lines) > 0:
final_line_index = len(text_as_lines) - 1
len_final_line = len(text_as_lines[final_line_index])
if len_final_line > 0:
text_as_lines[final_line_index] += '\n'
else:
text_as_lines[final_line_index] = '\n'