test_tiktoken.py
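
"""Round-trip tokenization check: read a .txt, .docx, or .pdf file, encode it
with tiktoken, decode the tokens back, verify the decoded text matches the
original exactly, and optionally summarize long content with a Hugging Face
summarization pipeline.

Dependencies (pip names): tiktoken, python-docx, PyPDF2, transformers, torch.
"""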
import os
import sys

import PyPDF2
import tiktoken
import torch
from docx import Document
from transformers import pipeline


def read_text_file(file_path):
    """Reads a .txt file and returns its content."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this fallback cannot fail.
        with open(file_path, 'r', encoding='latin-1') as file:
            return file.read()


def read_docx_file(file_path):
    """Reads a .docx file and returns its content."""
    doc = Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)


def read_pdf_file(file_path):
    """Reads a .pdf file and returns its content."""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        full_text = []
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                full_text.append(text)
            else:
                print(f"Warning: No text found on page {page_num + 1}.")
    return '\n'.join(full_text)
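
# Note: the PyPDF2 project has since been merged back into pypdf; new code can
# use "from pypdf import PdfReader", which exposes the same PdfReader API.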


def tokenize_text(text, encoding='cl100k_base'):
    """Tokenizes the input text using tiktoken.

    cl100k_base is the encoding used by the GPT-3.5/GPT-4 family of models.
    """
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)
    return tokens, enc


def detokenize_text(tokens, encoding='cl100k_base'):
    """Converts tokens back to a string using tiktoken."""
    enc = tiktoken.get_encoding(encoding)
    return enc.decode(tokens)
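
# Illustrative round trip: tiktoken's encoding is lossless for valid Unicode
# input, so decoding the tokens reproduces the original string exactly.
#   tokens, _ = tokenize_text("hello world")
#   assert detokenize_text(tokens) == "hello world"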


def summarize_text(text):
    """Summarizes the input text using Hugging Face's transformers."""
    # Use the GPU if one is available; device=-1 runs the pipeline on the CPU.
    device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline("summarization", device=device)
    # Summarization models have a maximum input length, so split the text into
    # fixed-size character chunks. This is a rough split: it can cut words in
    # half, and character counts only approximate the model's token limit.
    max_chunk = 1000  # Adjust based on the model's capacity.
    chunks = [text[i:i + max_chunk] for i in range(0, len(text), max_chunk)]
    summaries = []
    for chunk in chunks:
        summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
        summaries.append(summary[0]['summary_text'])
    return ' '.join(summaries)
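
# Illustrative usage; with no model argument, pipeline("summarization")
# downloads a default checkpoint on first call (a DistilBART CNN model at the
# time of writing). Pass model="..." explicitly for reproducible results.
#   summary = summarize_text(open("article.txt", encoding="utf-8").read())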


def get_word_count(text):
    """Returns the number of words in the text."""
    return len(text.split())


def get_char_count(text):
    """Returns the number of characters in the text."""
    return len(text)


def print_comparison_summary(original_text, decoded_text, tokens):
    """Prints a detailed comparison summary of the original and decoded text."""
    print("\n=== TOKENIZATION SUMMARY ===")
    print("Original Text Statistics:")
    print(f"- Word Count: {get_word_count(original_text)}")
    print(f"- Character Count: {get_char_count(original_text)}")
    print("\nDecoded Text Statistics:")
    print(f"- Word Count: {get_word_count(decoded_text)}")
    print(f"- Character Count: {get_char_count(decoded_text)}")
    print("\nToken Statistics:")
    print(f"- Total Tokens: {len(tokens)}")

    # Verify that decoding reproduced the original text exactly.
    texts_match = original_text == decoded_text
    print("\nText Verification:")
    print(f"- Exact Match: {'✓ YES' if texts_match else '✗ NO'}")

    if not texts_match:
        # Find where the texts start to differ.
        min_len = min(len(original_text), len(decoded_text))
        for i in range(min_len):
            if original_text[i] != decoded_text[i]:
                print(f"- First difference at position {i}")
                print(f"  Original: '{original_text[i:i+50]}...'")
                print(f"  Decoded:  '{decoded_text[i:i+50]}...'")
                break
        # Compare lengths if they differ.
        if len(original_text) != len(decoded_text):
            print(f"- Length mismatch: Original ({len(original_text)}) vs Decoded ({len(decoded_text)})")


def main():
    try:
        file_path = input('Enter the path to your file (e.g., c:\\temp\\file.txt): ').strip('"')
        if not os.path.isfile(file_path):
            print("The file does not exist. Please check the path and try again.")
            sys.exit(1)

        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        # Read the file based on its extension. Note that python-docx handles
        # only the modern .docx format; legacy .doc files are reported as
        # unsupported rather than passed to a reader that would fail on them.
        if file_extension == '.txt':
            text = read_text_file(file_path)
        elif file_extension == '.docx':
            text = read_docx_file(file_path)
        elif file_extension == '.pdf':
            text = read_pdf_file(file_path)
        else:
            print(f"Unsupported file type: {file_extension}")
            sys.exit(1)

        print("\n--- Original Text Preview ---")
        print(text[:1000] + ('...' if len(text) > 1000 else ''))

        # Tokenize the text, then convert the tokens back to a string.
        tokens, _ = tokenize_text(text)
        reconstructed_text = detokenize_text(tokens)
        print("\n--- Reconstructed Text Preview ---")
        print(reconstructed_text[:1000] + ('...' if len(reconstructed_text) > 1000 else ''))

        # Print the detailed comparison summary.
        print_comparison_summary(text, reconstructed_text, tokens)

        # Generate a content summary only if there is enough text to be useful.
        if get_word_count(text) > 100:
            print("\nGenerating content summary...")
            try:
                summary = summarize_text(text)
                print("\n--- Content Summary ---")
                print(summary)
            except Exception as e:
                print(f"\nUnable to generate content summary: {e}")
        else:
            print("\nText too short for meaningful summarization")
    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        print(traceback.format_exc())


if __name__ == '__main__':
    main()
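
# Example session (path and console output are illustrative):
#   $ python test_tiktoken.py
#   Enter the path to your file (e.g., c:\temp\file.txt): notes.txt
#   --- Original Text Preview --- ...
#   === TOKENIZATION SUMMARY === ...
#   Generating content summary...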