Initialize Table2Json app
Cybonto committed Jan 22, 2025
1 parent 4fbbdba commit 9a3a71f
Showing 17 changed files with 3,650 additions and 30 deletions.
133 changes: 133 additions & 0 deletions streamlit_app/app/logging_config.py
@@ -0,0 +1,133 @@
# logging_config.py

"""
logging_config.py
Defines logging configurations for the application.
Sets log levels, formats, handlers, and destinations (e.g., file, console).
This module provides a function `setup_logging()` to configure logging
for the entire application.
It is designed to be imported and called at the entry point of the application
before any logging is performed.
Example usage:
from logging_config import setup_logging
setup_logging()
"""

import logging
import logging.config
import os
import sys
from logging.handlers import RotatingFileHandler
from datetime import datetime
from typing import Optional

class LoggingConfigurationError(Exception):
"""
Custom exception raised when an error occurs during logging configuration.
"""
pass

def setup_logging(
    log_file_path: Optional[str] = None,
log_level: int = logging.INFO,
console_log_level: int = logging.INFO,
log_dir: str = "logs",
max_log_file_size: int = 10 * 1024 * 1024, # 10 MB
backup_count: int = 5,
):
"""
Configures logging for the application.
Parameters:
log_file_path (str):
The file path to write log files.
If not specified, defaults to 'logs/app.log'.
log_level (int):
The logging level for file logs.
Defaults to logging.INFO.
console_log_level (int):
The logging level for console logs.
Defaults to logging.INFO.
log_dir (str):
The directory where log files will be stored.
Defaults to 'logs'.
max_log_file_size (int):
The maximum size of a log file in bytes before it is rotated.
Defaults to 10 MB.
backup_count (int):
The number of backup log files to keep when rotating.
Defaults to 5.
Raises:
LoggingConfigurationError:
If an error occurs during logging setup.
Dependencies:
- Standard logging library.
- os, sys, datetime modules.
Example:
setup_logging()
"""

try:
        # Create the log directory if it doesn't already exist
        os.makedirs(log_dir, exist_ok=True)

# Set default log file path if not provided
if log_file_path is None:
log_file_path = os.path.join(log_dir, 'app.log')

# Define log formatters
standard_formatter = logging.Formatter(
fmt='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)

simple_formatter = logging.Formatter(
fmt='[%(levelname)s] %(message)s'
)

# Create console handler with specified log level
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(console_log_level)
console_handler.setFormatter(simple_formatter)

# Create rotating file handler
file_handler = RotatingFileHandler(
filename=log_file_path,
mode='a',
maxBytes=max_log_file_size,
backupCount=backup_count,
encoding='utf-8',
delay=0
)
file_handler.setLevel(log_level)
file_handler.setFormatter(standard_formatter)

# Get the root logger
logger = logging.getLogger()
logger.setLevel(logging.DEBUG) # Capture all levels; handlers will filter

# Remove any existing handlers
logger.handlers = []

# Add handlers to the root logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# Optionally, configure module-specific loggers
# For example, set different log levels for different modules
# logger_module = logging.getLogger('table2json_extractor')
# logger_module.setLevel(logging.DEBUG)

# Silence noisy loggers from third-party libraries if necessary
# logging.getLogger('some_noisy_library').setLevel(logging.WARNING)

    except Exception as e:
        # Re-raise as a custom exception, preserving the original traceback
        raise LoggingConfigurationError(f"Failed to configure logging: {e}") from e
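
For context, here is a minimal, illustrative sketch of how an entry point such as main.py (referenced later in this commit but not shown in this excerpt) might call setup_logging() before any other module emits log records. The module path, log directory, and log levels below are assumptions for illustration, not part of this diff.

# main.py -- illustrative sketch, not part of this commit's diff
import logging

from logging_config import setup_logging, LoggingConfigurationError

try:
    # Configure handlers once, before any other module logs anything
    setup_logging(log_dir="logs", log_level=logging.DEBUG, console_log_level=logging.INFO)
except LoggingConfigurationError as exc:
    # Fall back to a basic configuration so startup problems remain visible
    logging.basicConfig(level=logging.INFO)
    logging.getLogger(__name__).warning("Falling back to basicConfig: %s", exc)

logger = logging.getLogger(__name__)
logger.info("Logging configured; starting the Table2Json app.")
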
250 changes: 250 additions & 0 deletions streamlit_app/app/pages/Table2Json_Extractor.py
@@ -0,0 +1,250 @@
# streamlit_app/app/pages/Table2Json_Extractor.py

"""
Table2Json_Extractor.py
Streamlit page handling user interactions and displaying results for the Table to JSON Extractor app.
This module provides the user interface for uploading documents, specifying extraction parameters,
processing the documents, and displaying the extracted tables in JSON format.
It relies on other modules for processing, including:
- data_processing.py
- extraction_parameters.py
- structure_interpretation.py
- user_interface.py
- validation.py
- logging_handlers.py
"""

import logging
import os
import shutil
import tempfile

import streamlit as st

# Import necessary modules and functions
from data_processing import parse_documents
from extraction_parameters import (
ExtractionParameters,
TableSelectionCriteria,
FormattingRules,
ErrorHandlingStrategy,
ParserConfiguration,
ResourceLimits,
)
from structure_interpretation import interpret_table_structure
from user_interface import process_user_input, process_documents, render_results
from validation import validate_user_inputs, validate_extracted_data
from locale_manager import load_locale
from exceptions import (
InvalidUserInputError,
DataValidationError,
ProcessingError,
RenderingError,
InvalidParameterError,
)

# Initialize logging
logger = logging.getLogger(__name__)

# Set up logging configurations (assuming setup_logging from logging_config.py has been called in main.py)

# Load locale (for internationalization)
locale = load_locale('en') # For simplicity, using English locale

def main():
st.set_page_config(page_title="Table to JSON Extractor", layout="wide")

st.title("Table to JSON Extractor")
st.write("Upload your MS Word or PDF documents containing tables, specify your extraction parameters, and extract the tables in JSON format.")

# File uploader
uploaded_files = st.file_uploader(
"Choose MS Word or PDF files",
type=['pdf', 'docx', 'doc'],
accept_multiple_files=True
)

# Display uploaded files
if uploaded_files:
st.write("Uploaded files:")
for file in uploaded_files:
st.write(f"- {file.name}")

# Extraction parameters
st.header("Extraction Parameters")

# Table Selection Criteria
st.subheader("Table Selection Criteria")

selection_method = st.selectbox(
"Select Table Selection Method",
['Indexing', 'Keyword', 'Regex', 'Criteria']
)

indices = None
keywords = None
regex_patterns = None
row_conditions = None
column_conditions = None

if selection_method == 'Indexing':
indices_input = st.text_input("Enter Table Indices (comma-separated)", value="1")
try:
indices = [int(i.strip()) - 1 for i in indices_input.split(',')] # Adjusted to zero-based index
except ValueError:
st.error("Indices must be integers separated by commas.")
st.stop()
elif selection_method == 'Keyword':
keywords_input = st.text_input("Enter Keywords (comma-separated)")
keywords = [k.strip() for k in keywords_input.split(',') if k.strip()]
elif selection_method == 'Regex':
regex_input = st.text_input("Enter Regular Expressions (comma-separated)")
regex_patterns = [r.strip() for r in regex_input.split(',') if r.strip()]
elif selection_method == 'Criteria':
st.info("Criteria-based selection is not implemented in this interface yet.")
st.stop()

# Formatting Rules
st.subheader("Formatting Rules")

preserve_styles = st.checkbox("Preserve Text Styles (e.g., bold, italic)")
date_format = st.text_input("Date Format", value="%Y-%m-%d")
number_format = st.text_input("Number Format (e.g., {:.2f} for two decimal places)", value="")
encoding = st.text_input("Text Encoding", value="utf-8")
placeholder_for_missing = st.text_input("Placeholder for Missing Data", value="")

# Error Handling Strategy
st.subheader("Error Handling Strategy")

on_parsing_error = st.selectbox(
"On Parsing Error",
['Skip', 'Abort', 'Log'],
index=2 # Default to 'Log'
)
on_validation_error = st.selectbox(
"On Validation Error",
['Correct', 'Omit', 'Prompt', 'Abort'],
index=1 # Default to 'Omit'
)

# Parser Configuration
st.subheader("Parser Configuration")

ocr_enabled = st.checkbox("Enable OCR for Scanned PDFs")
language = st.text_input("Document Language", value="en")

# Resource Limits
st.subheader("Resource Limits")

max_memory = st.number_input("Max Memory (MB)", min_value=0, value=0)
max_time = st.number_input("Max Time (seconds)", min_value=0, value=0)
max_cpu_usage = st.number_input("Max CPU Usage (%)", min_value=0, max_value=100, value=0)

# Process Button
if st.button("Process Documents"):
if not uploaded_files:
st.error("Please upload at least one document.")
st.stop()

# Collect user inputs
user_inputs = {
'table_selection': {
'method': selection_method.lower(),
'indices': indices,
'keywords': keywords,
'regex_patterns': regex_patterns,
'row_conditions': row_conditions,
'column_conditions': column_conditions
},
'formatting_rules': {
'preserve_styles': preserve_styles,
'date_format': date_format,
'number_format': number_format if number_format else None,
'encoding': encoding,
'placeholder_for_missing': placeholder_for_missing if placeholder_for_missing else None
},
'error_handling': {
'on_parsing_error': on_parsing_error.lower(),
'on_validation_error': on_validation_error.lower(),
'fallback_mechanisms': [] # Placeholder
},
'parser_config': {
'ocr_enabled': ocr_enabled,
'language': language,
'resource_limits': {
'max_memory': int(max_memory) if max_memory > 0 else None,
'max_time': int(max_time) if max_time > 0 else None,
'max_cpu_usage': int(max_cpu_usage) if max_cpu_usage > 0 else None
}
},
'data_types': {} # Could be expanded to accept user inputs
}

# Process user inputs
try:
extraction_parameters = process_user_input(user_inputs)
except InvalidUserInputError as e:
st.error(f"Invalid user input: {e}")
logger.error(f"Invalid user input: {e}")
st.stop()

        # Convert uploaded_files into temporary file paths on disk

file_paths = []
temp_dirs = []

try:
for uploaded_file in uploaded_files:
# Save uploaded file to a temporary directory
temp_dir = tempfile.mkdtemp()
temp_dirs.append(temp_dir)
file_name = uploaded_file.name
temp_file_path = os.path.join(temp_dir, file_name)

with open(temp_file_path, 'wb') as f:
f.write(uploaded_file.read())

file_paths.append(temp_file_path)
logger.debug(f"Saved uploaded file '{uploaded_file.name}' to temporary file '{temp_file_path}'.")

# Process the documents
try:
extracted_data = process_documents(file_paths, extraction_parameters)
except ProcessingError as e:
st.error(f"Error processing documents: {e}")
logger.error(f"Error processing documents: {e}")
st.stop()

# Render results
try:
output_format = 'json' # Currently supporting JSON output
rendered_data = render_results(extracted_data, output_format)
st.success("Documents processed successfully!")
st.subheader("Extracted Tables in JSON Format")
st.code(rendered_data, language='json')

# Provide a download button
st.download_button(
label="Download JSON",
data=rendered_data,
file_name='extracted_tables.json',
mime='application/json'
)

except RenderingError as e:
st.error(f"Error rendering results: {e}")
logger.error(f"Error rendering results: {e}")
st.stop()

finally:
# Clean up temporary files and directories
for temp_dir in temp_dirs:
try:
shutil.rmtree(temp_dir)
logger.debug(f"Deleted temporary directory '{temp_dir}'.")
except OSError as e:
logger.warning(f"Error deleting temporary directory '{temp_dir}': {e}")

if __name__ == '__main__':
main()
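
To exercise this page locally, Streamlit's standard multipage convention applies: assuming main.py in streamlit_app/app/ is the entry script that calls setup_logging(), running

    streamlit run streamlit_app/app/main.py

lists the Table2Json Extractor page in the sidebar automatically, since it lives under the pages/ directory. (The exact entry-script path is an assumption; main.py is referenced in the code above but not shown in this excerpt.)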