-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
3,650 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
# logging_config.py | ||
|
||
""" | ||
logging_config.py | ||
Defines logging configurations for the application. | ||
Sets log levels, formats, handlers, and destinations (e.g., file, console). | ||
This module provides a function `setup_logging()` to configure logging | ||
for the entire application. | ||
It is designed to be imported and called at the entry point of the application | ||
before any logging is performed. | ||
Example usage: | ||
from logging_config import setup_logging | ||
setup_logging() | ||
""" | ||
|
||
import logging | ||
import logging.config | ||
import os | ||
import sys | ||
from logging.handlers import RotatingFileHandler | ||
from datetime import datetime | ||
|
||
class LoggingConfigurationError(Exception):
    """Raised when the application's logging setup cannot be completed."""
|
||
def setup_logging(
    log_file_path: str = None,
    log_level: int = logging.INFO,
    console_log_level: int = logging.INFO,
    log_dir: str = "logs",
    max_log_file_size: int = 10 * 1024 * 1024,  # 10 MB
    backup_count: int = 5,
):
    """
    Configure logging for the entire application.

    Installs two handlers on the root logger: a console handler writing to
    stdout and a size-based rotating file handler. The root logger level is
    set to DEBUG so that the handlers alone decide what gets emitted.

    Parameters:
        log_file_path (str):
            The file path to write log files.
            If not specified, defaults to '<log_dir>/app.log'.
        log_level (int):
            The logging level for file logs. Defaults to logging.INFO.
        console_log_level (int):
            The logging level for console logs. Defaults to logging.INFO.
        log_dir (str):
            The directory where log files will be stored. Defaults to 'logs'.
        max_log_file_size (int):
            The maximum size of a log file in bytes before it is rotated.
            Defaults to 10 MB.
        backup_count (int):
            The number of backup log files to keep when rotating.
            Defaults to 5.

    Raises:
        LoggingConfigurationError:
            If any step of the logging setup fails; the original exception
            is preserved as __cause__.

    Example:
        setup_logging()
    """
    try:
        # exist_ok=True avoids the check-then-create race of the previous
        # os.path.exists()/os.makedirs() pair (another process could create
        # the directory between the two calls).
        os.makedirs(log_dir, exist_ok=True)

        # Set default log file path if not provided
        if log_file_path is None:
            log_file_path = os.path.join(log_dir, 'app.log')

        # Detailed formatter for the file; terse one for the console.
        standard_formatter = logging.Formatter(
            fmt='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        simple_formatter = logging.Formatter(
            fmt='[%(levelname)s] %(message)s'
        )

        # Console handler with its own threshold
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(console_log_level)
        console_handler.setFormatter(simple_formatter)

        # Rotating file handler; delay=0 opens the log file immediately
        file_handler = RotatingFileHandler(
            filename=log_file_path,
            mode='a',
            maxBytes=max_log_file_size,
            backupCount=backup_count,
            encoding='utf-8',
            delay=0
        )
        file_handler.setLevel(log_level)
        file_handler.setFormatter(standard_formatter)

        # Root logger captures everything; handlers filter by level.
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

        # Drop any previously installed handlers so repeated calls do not
        # duplicate output.
        logger.handlers = []

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

        # Optionally, configure module-specific loggers
        # logger_module = logging.getLogger('table2json_extractor')
        # logger_module.setLevel(logging.DEBUG)

        # Silence noisy loggers from third-party libraries if necessary
        # logging.getLogger('some_noisy_library').setLevel(logging.WARNING)

    except Exception as e:
        # Chain the original exception so the root cause survives in
        # tracebacks (previously the 'from e' was missing).
        raise LoggingConfigurationError(f"Failed to configure logging: {e}") from e
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,250 @@ | ||
# streamlit_app/app/pages/Table2Json_Extractor.py | ||
|
||
""" | ||
Table2Json_Extractor.py | ||
Streamlit page handling user interactions and displaying results for the Table to JSON Extractor app. | ||
This module provides the user interface for uploading documents, specifying extraction parameters, | ||
processing the documents, and displaying the extracted tables in JSON format. | ||
It relies on other modules for processing, including: | ||
- data_processing.py | ||
- extraction_parameters.py | ||
- structure_interpretation.py | ||
- user_interface.py | ||
- validation.py | ||
- logging_handlers.py | ||
""" | ||
|
||
import streamlit as st | ||
import logging | ||
|
||
# Import necessary modules and functions | ||
from data_processing import parse_documents | ||
from extraction_parameters import ( | ||
ExtractionParameters, | ||
TableSelectionCriteria, | ||
FormattingRules, | ||
ErrorHandlingStrategy, | ||
ParserConfiguration, | ||
ResourceLimits, | ||
) | ||
from structure_interpretation import interpret_table_structure | ||
from user_interface import process_user_input, process_documents, render_results | ||
from validation import validate_user_inputs, validate_extracted_data | ||
from locale_manager import load_locale | ||
from exceptions import ( | ||
InvalidUserInputError, | ||
DataValidationError, | ||
ProcessingError, | ||
RenderingError, | ||
InvalidParameterError, | ||
) | ||
|
||
# Initialize logging
# Module-level logger for this page; handlers/levels on the root logger are
# expected to be installed by setup_logging() (see logging_config.py),
# called from main.py before this page runs.
logger = logging.getLogger(__name__)

# Set up logging configurations (assuming setup_logging from logging_config.py has been called in main.py)

# Load locale (for internationalization)
locale = load_locale('en')  # For simplicity, using English locale
|
||
def main():
    """
    Render the Table to JSON Extractor Streamlit page.

    Workflow:
      1. Collect document uploads (PDF/DOCX/DOC) and extraction parameters
         from the user.
      2. On "Process Documents", validate inputs, persist uploads to
         temporary files, and run the extraction pipeline.
      3. Display the extracted tables as JSON and offer a download button.

    Temporary directories are always cleaned up in the `finally` block,
    even when processing or rendering fails.
    """
    st.set_page_config(page_title="Table to JSON Extractor", layout="wide")

    st.title("Table to JSON Extractor")
    st.write("Upload your MS Word or PDF documents containing tables, specify your extraction parameters, and extract the tables in JSON format.")

    # File uploader
    uploaded_files = st.file_uploader(
        "Choose MS Word or PDF files",
        type=['pdf', 'docx', 'doc'],
        accept_multiple_files=True
    )

    # Display uploaded files
    if uploaded_files:
        st.write("Uploaded files:")
        for file in uploaded_files:
            st.write(f"- {file.name}")

    # Extraction parameters
    st.header("Extraction Parameters")

    # Table Selection Criteria
    st.subheader("Table Selection Criteria")

    selection_method = st.selectbox(
        "Select Table Selection Method",
        ['Indexing', 'Keyword', 'Regex', 'Criteria']
    )

    indices = None
    keywords = None
    regex_patterns = None
    row_conditions = None
    column_conditions = None

    if selection_method == 'Indexing':
        indices_input = st.text_input("Enter Table Indices (comma-separated)", value="1")
        try:
            # UI is 1-based; the pipeline expects 0-based indices.
            indices = [int(i.strip()) - 1 for i in indices_input.split(',')]
        except ValueError:
            st.error("Indices must be integers separated by commas.")
            st.stop()
    elif selection_method == 'Keyword':
        keywords_input = st.text_input("Enter Keywords (comma-separated)")
        keywords = [k.strip() for k in keywords_input.split(',') if k.strip()]
    elif selection_method == 'Regex':
        regex_input = st.text_input("Enter Regular Expressions (comma-separated)")
        regex_patterns = [r.strip() for r in regex_input.split(',') if r.strip()]
    elif selection_method == 'Criteria':
        st.info("Criteria-based selection is not implemented in this interface yet.")
        st.stop()

    # Formatting Rules
    st.subheader("Formatting Rules")

    preserve_styles = st.checkbox("Preserve Text Styles (e.g., bold, italic)")
    date_format = st.text_input("Date Format", value="%Y-%m-%d")
    number_format = st.text_input("Number Format (e.g., {:.2f} for two decimal places)", value="")
    encoding = st.text_input("Text Encoding", value="utf-8")
    placeholder_for_missing = st.text_input("Placeholder for Missing Data", value="")

    # Error Handling Strategy
    st.subheader("Error Handling Strategy")

    on_parsing_error = st.selectbox(
        "On Parsing Error",
        ['Skip', 'Abort', 'Log'],
        index=2  # Default to 'Log'
    )
    on_validation_error = st.selectbox(
        "On Validation Error",
        ['Correct', 'Omit', 'Prompt', 'Abort'],
        index=1  # Default to 'Omit'
    )

    # Parser Configuration
    st.subheader("Parser Configuration")

    ocr_enabled = st.checkbox("Enable OCR for Scanned PDFs")
    language = st.text_input("Document Language", value="en")

    # Resource Limits (0 means "no limit" and is translated to None below)
    st.subheader("Resource Limits")

    max_memory = st.number_input("Max Memory (MB)", min_value=0, value=0)
    max_time = st.number_input("Max Time (seconds)", min_value=0, value=0)
    max_cpu_usage = st.number_input("Max CPU Usage (%)", min_value=0, max_value=100, value=0)

    # Process Button
    if st.button("Process Documents"):
        if not uploaded_files:
            st.error("Please upload at least one document.")
            st.stop()

        # Collect user inputs into the schema expected by process_user_input()
        user_inputs = {
            'table_selection': {
                'method': selection_method.lower(),
                'indices': indices,
                'keywords': keywords,
                'regex_patterns': regex_patterns,
                'row_conditions': row_conditions,
                'column_conditions': column_conditions
            },
            'formatting_rules': {
                'preserve_styles': preserve_styles,
                'date_format': date_format,
                'number_format': number_format if number_format else None,
                'encoding': encoding,
                'placeholder_for_missing': placeholder_for_missing if placeholder_for_missing else None
            },
            'error_handling': {
                'on_parsing_error': on_parsing_error.lower(),
                'on_validation_error': on_validation_error.lower(),
                'fallback_mechanisms': []  # Placeholder
            },
            'parser_config': {
                'ocr_enabled': ocr_enabled,
                'language': language,
                'resource_limits': {
                    'max_memory': int(max_memory) if max_memory > 0 else None,
                    'max_time': int(max_time) if max_time > 0 else None,
                    'max_cpu_usage': int(max_cpu_usage) if max_cpu_usage > 0 else None
                }
            },
            'data_types': {}  # Could be expanded to accept user inputs
        }

        # Process user inputs
        try:
            extraction_parameters = process_user_input(user_inputs)
        except InvalidUserInputError as e:
            st.error(f"Invalid user input: {e}")
            logger.error(f"Invalid user input: {e}")
            st.stop()

        # Persist uploads to temp files so the parsers can work on paths.
        # BUGFIX: 'os' was used below (os.path.join) but never imported
        # anywhere in this module; import it here with the other local imports.
        import os
        import tempfile
        import shutil

        file_paths = []
        temp_dirs = []

        try:
            for uploaded_file in uploaded_files:
                # Save uploaded file to a temporary directory
                temp_dir = tempfile.mkdtemp()
                temp_dirs.append(temp_dir)
                file_name = uploaded_file.name
                temp_file_path = os.path.join(temp_dir, file_name)

                with open(temp_file_path, 'wb') as f:
                    f.write(uploaded_file.read())

                file_paths.append(temp_file_path)
                logger.debug(f"Saved uploaded file '{uploaded_file.name}' to temporary file '{temp_file_path}'.")

            # Process the documents
            try:
                extracted_data = process_documents(file_paths, extraction_parameters)
            except ProcessingError as e:
                st.error(f"Error processing documents: {e}")
                logger.error(f"Error processing documents: {e}")
                st.stop()

            # Render results
            try:
                output_format = 'json'  # Currently supporting JSON output
                rendered_data = render_results(extracted_data, output_format)
                st.success("Documents processed successfully!")
                st.subheader("Extracted Tables in JSON Format")
                st.code(rendered_data, language='json')

                # Provide a download button
                st.download_button(
                    label="Download JSON",
                    data=rendered_data,
                    file_name='extracted_tables.json',
                    mime='application/json'
                )

            except RenderingError as e:
                st.error(f"Error rendering results: {e}")
                logger.error(f"Error rendering results: {e}")
                st.stop()

        finally:
            # Clean up temporary files and directories even on failure
            for temp_dir in temp_dirs:
                try:
                    shutil.rmtree(temp_dir)
                    logger.debug(f"Deleted temporary directory '{temp_dir}'.")
                except OSError as e:
                    logger.warning(f"Error deleting temporary directory '{temp_dir}': {e}")

if __name__ == '__main__':
    main()
Oops, something went wrong.