Commit e5a175b

Initial commit

PedroDnT committed Jul 7, 2024
0 parents commit e5a175b

Showing 103 changed files with 5,558,219 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.env
58 changes: 58 additions & 0 deletions .ipynb_checkpoints/tester-checkpoint.ipynb
@@ -0,0 +1,58 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mRunning cells with 'Python 3.12.4' requires the ipykernel package.\n",
"\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
"\u001b[1;31mCommand: '/usr/local/bin/python3 -m pip install ipykernel -U --user --force-reinstall'"
]
}
],
"source": [
"from handler import *\n",
"suz = 13986\n",
"cf = return_df(CD_CVM=suz, statment='CF')\n",
"inc = return_df(CD_CVM=suz, statment='IS')\n",
"bs = return_df(CD_CVM=suz, statment='BS')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ref=read_files_ref()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "new-db-PuNDkLGb-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Empty file added __init__.py
Empty file.
Binary file added __pycache__/call.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/data_retriever.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/fetcher.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/financial_analyzer.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/handler.cpython-310.pyc
Binary file not shown.
Binary file added __pycache__/handler.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/process_and_upload.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/processor.cpython-312.pyc
Binary file not shown.
228 changes: 228 additions & 0 deletions call.py
@@ -0,0 +1,228 @@
import os
import json
from typing import Dict, Any, List, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableSequence
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import pandas as pd
import numpy as np
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.callbacks import get_openai_callback
from fetcher import execute_query_batch, get_distinct_cd_cvm

def get_financial_statements_batch(cd_cvm_list: List[str]) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
income_statements = execute_query_batch(cd_cvm_list, 'ist')
balance_sheets = execute_query_batch(cd_cvm_list, 'bs')
cash_flows = execute_query_batch(cd_cvm_list, 'cf')

return {
cd_cvm: (income_statements.get(cd_cvm, pd.DataFrame()),
balance_sheets.get(cd_cvm, pd.DataFrame()),
cash_flows.get(cd_cvm, pd.DataFrame()))
for cd_cvm in cd_cvm_list
}
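# Usage sketch (illustrative, not part of the original module; the CD_CVM code below is the one
# used in the repository's tester notebook):
#   batch = get_financial_statements_batch(['13986'])
#   income_df, balance_df, cashflow_df = batch['13986']
# Any statement missing from the query results comes back as an empty DataFrame rather than raising.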

def calculate_actual_results(income_statement: pd.DataFrame) -> List[Tuple[str, int]]:
earnings_column = 'Resultado Líquido das Operações Continuadas'
results = []

if 'DS_CONTA' not in income_statement.columns:
raise ValueError("Expected 'DS_CONTA' column in income statement")

earnings_rows = income_statement[income_statement['DS_CONTA'] == earnings_column]

date_columns = [col for col in earnings_rows.columns if col.startswith('20') and col.endswith('-12-31')]
sorted_dates = sorted(date_columns)

for i in range(1, len(sorted_dates)):
current_earnings = earnings_rows[sorted_dates[i]].values[0]
previous_earnings = earnings_rows[sorted_dates[i-1]].values[0]

if pd.notnull(current_earnings) and pd.notnull(previous_earnings):
result = 1 if current_earnings > previous_earnings else -1
period = sorted_dates[i]

results.append((period, result))

return results
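# Illustrative check of the labeling above (assumed toy data, not from the original file): a frame with
# DS_CONTA == 'Resultado Líquido das Operações Continuadas' and year-end columns '2021-12-31' = 100.0 and
# '2022-12-31' = 120.0 yields [('2022-12-31', 1)]; had 2022 come in below 2021 the label would be -1, and
# periods with a missing current or previous value are skipped entirely.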

def create_prompt_template() -> ChatPromptTemplate:
template = """
As a financial expert, analyze the provided financial statements and predict future earnings for the specified target period. You MUST provide analysis and prediction for the target period. Follow this structure:
1. Trend Analysis (Panel A): Analyze relevant trends over the past three years.
2. Ratio Analysis (Panel B): Calculate and analyze the financial ratios you consider relevant, provide economic interpretations of the computed ratios, and discuss their implications for future earnings.
3. Rationale (Panel C): Summarize your trend and ratio analyses to make a prediction. Explain your prediction reasoning concisely.
4. Prediction: State the earnings direction (increase/decrease), magnitude (large/moderate/small), and confidence (0.0-1.0).
Financial data: {financial_data}
Target period: {target_period}
Provide your analysis in this format for the target period:
Panel A - Trend Analysis: [Brief trend analysis]
Panel B - Ratio Analysis: [Brief ratio analysis]
Panel C - Rationale: [Concise rationale for prediction]
Direction: [increase/decrease]
Magnitude: [large/moderate/small]
Confidence: [0.00 to 1.00]
Note: Direction will be interpreted as 1 for increase and -1 for decrease.
You MUST provide all sections (Panel A, B, C, Direction, Magnitude, and Confidence) for the target period.
You MUST make a prediction and Direction can never be 0.
If data is limited, make reasonable assumptions based on available information and state these assumptions in your analysis.
Try to keep the response under 500 tokens.
"""
return ChatPromptTemplate.from_template(template)
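# Illustrative response shape (an assumption about model output, not from the original file) that the
# parser in get_financial_prediction below handles; note it extracts Direction/Magnitude/Confidence only
# from markdown-bullet lines inside the Panel C section:
#   Panel A - Trend Analysis: Revenue grew steadily over the last three years ...
#   Panel B - Ratio Analysis: Net margin widened while leverage fell ...
#   Panel C - Rationale: Margin expansion and lower leverage point to higher earnings ...
#   - **Direction**: increase
#   - **Magnitude**: moderate
#   - **Confidence**: 0.75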

def get_llm(model_name: str, **kwargs) -> ChatOpenAI:
base_kwargs = {
"temperature": 0,
"model_kwargs": {"logprobs": True, "top_p": 1},
**kwargs
}

if model_name.startswith('openai/'):
return ChatOpenAI(model_name=model_name.split('/')[-1], **base_kwargs)
else:
return ChatOpenAI(
model=model_name,
openai_api_base="https://openrouter.ai/api/v1",
openai_api_key=os.getenv("OPENROUTER_API_KEY"),
**base_kwargs
)
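# Routing sketch (model names taken from the __main__ block below): get_llm('openai/gpt-4-turbo') calls
# the OpenAI API directly with model name 'gpt-4-turbo', while any other identifier, e.g.
# get_llm('anthropic/claude-3.5-sonnet'), is routed through the OpenRouter endpoint using
# OPENROUTER_API_KEY from the environment.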

def get_financial_prediction(financial_data: str, target_period: str, chain: RunnableSequence) -> Dict[str, Any]:
with get_openai_callback() as cb:
response = chain.invoke({"financial_data": financial_data, "target_period": target_period})
prediction = {
'trend_analysis': 'Analysis not provided',
'ratio_analysis': 'Analysis not provided',
'rationale': 'Rationale not provided',
'direction': 0,
'magnitude': 'unknown',
'confidence': 0.0
}
# Check if response is an AIMessage object
if hasattr(response, 'content'):
response_text = response.content
else:
response_text = str(response)
# Split the response into sections
sections = response_text.split('Panel')
for section in sections:
if 'A - Trend Analysis:' in section:
prediction['trend_analysis'] = section.split('A - Trend Analysis:', 1)[1].strip()
elif 'B - Ratio Analysis:' in section:
prediction['ratio_analysis'] = section.split('B - Ratio Analysis:', 1)[1].strip()
elif 'C - Rationale:' in section:
rationale_section = section.split('C - Rationale:', 1)[1].strip()
prediction['rationale'] = rationale_section
# Extract direction, magnitude, and confidence from rationale
lines = rationale_section.split('\n')
for line in lines:
if line.startswith('- **Direction**:'):
direction = line.split(':', 1)[1].strip().lower()
prediction['direction'] = 1 if 'increase' in direction else (-1 if 'decrease' in direction else 0)
elif line.startswith('- **Magnitude**:'):
prediction['magnitude'] = line.split(':', 1)[1].strip().lower()
elif line.startswith('- **Confidence**:'):
try:
prediction['confidence'] = float(line.split(':', 1)[1].strip())
except ValueError:
prediction['confidence'] = 0.0
prediction['token_usage'] = {
'total_tokens': cb.total_tokens,
}
return prediction

def run_predictions(cd_cvm_list: List[str], models_to_test: List[tuple]) -> pd.DataFrame:
results = []
# Get financial statements for all companies at once
financial_statements_batch = get_financial_statements_batch(cd_cvm_list)
for model_name, model_kwargs in models_to_test:
print(f"\nTesting model: {model_name}")
llm = get_llm(model_name, **model_kwargs)
prompt = create_prompt_template()
chain = prompt | llm
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
future_to_prediction = {}
for cd_cvm, (income_statement, balance_sheet, cash_flow) in financial_statements_batch.items():
actual_results = calculate_actual_results(income_statement)

# Prepare financial data for all periods
financial_statements = {
'income_statement': income_statement.to_dict(orient='records'),
'balance_sheet': balance_sheet.to_dict(orient='records'),
'cash_flow_statement': cash_flow.to_dict(orient='records')
}
financial_data = json.dumps(financial_statements)

for target_period, actual_result in actual_results:
future = executor.submit(get_financial_prediction, financial_data, target_period, chain)
future_to_prediction[(cd_cvm, target_period, actual_result)] = future
for (cd_cvm, target_period, actual_result), future in future_to_prediction.items():
try:
prediction = future.result()
results.append({
'Model': model_name,
'TREND ANALYSIS': prediction.get('trend_analysis', ''),
'RATIO ANALYSIS': prediction.get('ratio_analysis', ''),
'RATIONALE': prediction.get('rationale', ''),
'DIRECTION': prediction.get('direction', ''),
'MAGNITUDE': prediction.get('magnitude', ''),
'CONFIDENCE LEVEL': prediction.get('confidence', ''),
'ACTUAL DIRECTION': actual_result,
'CD_CVM': cd_cvm,
'TARGET PERIOD': target_period
})
except Exception as exc:
print(f'Generated an exception for {cd_cvm}, {target_period}: {exc}')
print(f'Exception type: {type(exc)}')
print(f'Exception details: {str(exc)}')
return pd.DataFrame(results)

def calculate_metrics(predictions: List[int], actual_results: List[int]) -> Dict[str, Any]:
if not predictions or not actual_results:
return {'accuracy': 0.0, 'f1_score': 0.0}

accuracy = float(accuracy_score(actual_results, predictions))

# Check if it's binary or multiclass
unique_labels = set(actual_results + predictions)
if len(unique_labels) <= 2:
f1 = float(f1_score(actual_results, predictions, average='binary'))
else:
f1 = float(f1_score(actual_results, predictions, average='weighted'))

# Calculate confusion matrix
cm = confusion_matrix(actual_results, predictions)

return {
'accuracy': accuracy,
'f1_score': f1,
'confusion_matrix': cm.tolist(), # Convert to list for JSON serialization
'support': {
'increase': int(np.sum(np.array(actual_results) == 1)),
'decrease': int(np.sum(np.array(actual_results) == -1))
}
}
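# Usage sketch with toy labels (illustrative only):
#   calculate_metrics(predictions=[1, -1, 1, 1], actual_results=[1, -1, -1, 1])
# gives accuracy 0.75, binary F1 of 0.8 for the positive (increase) class, a 2x2 confusion matrix over
# the labels {-1, 1}, and support counts of 2 increases and 2 decreases in the actuals.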

# Main execution
if __name__ == "__main__":
cd_cvm_list = get_distinct_cd_cvm()

models_to_test = [
('openai/gpt-4-turbo', {}),
('anthropic/claude-3.5-sonnet', {})
]

results_df = run_predictions(cd_cvm_list, models_to_test)
print(results_df.to_string())

# Calculate and print metrics
predictions = results_df['DIRECTION'].tolist()
actual_results = results_df['ACTUAL DIRECTION'].tolist()
metrics = calculate_metrics(predictions, actual_results)
print("\nMetrics:")
print(json.dumps(metrics, indent=2))
8 changes: 8 additions & 0 deletions cvm_db.code-workspace
@@ -0,0 +1,8 @@
{
"folders": [
{
"path": "."
}
],
"settings": {}
}

0 comments on commit e5a175b
