Commit e5a175b

Initial commit

PedroDnT committed Jul 7, 2024
0 parents commit e5a175b

Showing 103 changed files with 5,558,219 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.env
58 changes: 58 additions & 0 deletions .ipynb_checkpoints/tester-checkpoint.ipynb
@@ -0,0 +1,58 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mRunning cells with 'Python 3.12.4' requires the ipykernel package.\n",
"\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
"\u001b[1;31mCommand: '/usr/local/bin/python3 -m pip install ipykernel -U --user --force-reinstall'"
]
}
],
"source": [
"from handler import *\n",
"suz = 13986\n",
"cf = return_df(CD_CVM=suz, statment='CF')\n",
"inc = return_df(CD_CVM=suz, statment='IS')\n",
"bs = return_df(CD_CVM=suz, statment='BS')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ref=read_files_ref()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "new-db-PuNDkLGb-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Empty file added __init__.py
Empty file.
Binary file added __pycache__/call.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/data_retriever.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/fetcher.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/financial_analyzer.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/handler.cpython-310.pyc
Binary file not shown.
Binary file added __pycache__/handler.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/process_and_upload.cpython-312.pyc
Binary file not shown.
Binary file added __pycache__/processor.cpython-312.pyc
Binary file not shown.
228 changes: 228 additions & 0 deletions call.py
@@ -0,0 +1,228 @@
import os
import json
from typing import Dict, Any, List, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableSequence
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import pandas as pd
import numpy as np
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.callbacks import get_openai_callback
from fetcher import execute_query_batch, get_distinct_cd_cvm

def get_financial_statements_batch(cd_cvm_list: List[str]) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
income_statements = execute_query_batch(cd_cvm_list, 'ist')
balance_sheets = execute_query_batch(cd_cvm_list, 'bs')
cash_flows = execute_query_batch(cd_cvm_list, 'cf')

return {
cd_cvm: (income_statements.get(cd_cvm, pd.DataFrame()),
balance_sheets.get(cd_cvm, pd.DataFrame()),
cash_flows.get(cd_cvm, pd.DataFrame()))
for cd_cvm in cd_cvm_list
}
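# Usage sketch (illustrative, not part of the original module; the CD_CVM code below is the one
# used in the repository's tester notebook):
#   batch = get_financial_statements_batch(['13986'])
#   income_df, balance_df, cashflow_df = batch['13986']
# Any statement missing from the query results comes back as an empty DataFrame rather than raising.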

def calculate_actual_results(income_statement: pd.DataFrame) -> List[Tuple[str, int]]:
earnings_column = 'Resultado Líquido das Operações Continuadas'
results = []

if 'DS_CONTA' not in income_statement.columns:
raise ValueError("Expected 'DS_CONTA' column in income statement")

earnings_rows = income_statement[income_statement['DS_CONTA'] == earnings_column]

date_columns = [col for col in earnings_rows.columns if col.startswith('20') and col.endswith('-12-31')]
sorted_dates = sorted(date_columns)

for i in range(1, len(sorted_dates)):
current_earnings = earnings_rows[sorted_dates[i]].values[0]
previous_earnings = earnings_rows[sorted_dates[i-1]].values[0]

if pd.notnull(current_earnings) and pd.notnull(previous_earnings):
result = 1 if current_earnings > previous_earnings else -1
period = sorted_dates[i]

results.append((period, result))

return results
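# Illustrative check of the labeling above (assumed toy data, not from the original file): a frame with
# DS_CONTA == 'Resultado Líquido das Operações Continuadas' and year-end columns '2021-12-31' = 100.0 and
# '2022-12-31' = 120.0 yields [('2022-12-31', 1)]; had 2022 come in below 2021 the label would be -1, and
# periods with a missing current or previous value are skipped entirely.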

def create_prompt_template() -> ChatPromptTemplate:
template = """
As a financial expert, analyze the provided financial statements and predict future earnings for the specified target period. You MUST provide analysis and prediction for the target period. Follow this structure:
1. Trend Analysis (Panel A): Analyze relevant trends over the past three years.
2. Ratio Analysis (Panel B): Calculate and analyze the financial ratios you consider relevant, provide economic interpretations of the computed ratios, and discuss their implications for future earnings.
3. Rationale (Panel C): Summarize your trend and ratio analyses to make a prediction. Explain your prediction reasoning concisely.
4. Prediction: State the earnings direction (increase/decrease), magnitude (large/moderate/small), and confidence (0.0-1.0).
Financial data: {financial_data}
Target period: {target_period}
Provide your analysis in this format for the target period:
Panel A - Trend Analysis: [Brief trend analysis]
Panel B - Ratio Analysis: [Brief ratio analysis]
Panel C - Rationale: [Concise rationale for prediction]
Direction: [increase/decrease]
Magnitude: [large/moderate/small]
Confidence: [0.00 to 1.00]
Note: Direction will be interpreted as 1 for increase and -1 for decrease.
You MUST provide all sections (Panel A, B, C, Direction, Magnitude, and Confidence) for the target period.
You MUST make a prediction and Direction can never be 0.
If data is limited, make reasonable assumptions based on available information and state these assumptions in your analysis.
Try to keep the response under 500 tokens.
"""
return ChatPromptTemplate.from_template(template)
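# Illustrative response shape (an assumption about model output, not from the original file) that the
# parser in get_financial_prediction below handles; note it extracts Direction/Magnitude/Confidence only
# from markdown-bullet lines inside the Panel C section:
#   Panel A - Trend Analysis: Revenue grew steadily over the last three years ...
#   Panel B - Ratio Analysis: Net margin widened while leverage fell ...
#   Panel C - Rationale: Margin expansion and lower leverage point to higher earnings ...
#   - **Direction**: increase
#   - **Magnitude**: moderate
#   - **Confidence**: 0.75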

def get_llm(model_name: str, **kwargs) -> ChatOpenAI:
base_kwargs = {
"temperature": 0,
"model_kwargs": {"logprobs": True, "top_p": 1},
**kwargs
}

if model_name.startswith('openai/'):
return ChatOpenAI(model_name=model_name.split('/')[-1], **base_kwargs)
else:
return ChatOpenAI(
model=model_name,
openai_api_base="https://openrouter.ai/api/v1",
openai_api_key=os.getenv("OPENROUTER_API_KEY"),
**base_kwargs
)
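# Routing sketch (model names taken from the __main__ block below): get_llm('openai/gpt-4-turbo') calls
# the OpenAI API directly with model name 'gpt-4-turbo', while any other identifier, e.g.
# get_llm('anthropic/claude-3.5-sonnet'), is routed through the OpenRouter endpoint using
# OPENROUTER_API_KEY from the environment.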

def get_financial_prediction(financial_data: str, target_period: str, chain: RunnableSequence) -> Dict[str, Any]:
with get_openai_callback() as cb:
response = chain.invoke({"financial_data": financial_data, "target_period": target_period})
prediction = {
'trend_analysis': 'Analysis not provided',
'ratio_analysis': 'Analysis not provided',
'rationale': 'Rationale not provided',
'direction': 0,
'magnitude': 'unknown',
'confidence': 0.0
}
# Check if response is an AIMessage object
if hasattr(response, 'content'):
response_text = response.content
else:
response_text = str(response)
# Split the response into sections
sections = response_text.split('Panel')
for section in sections:
if 'A - Trend Analysis:' in section:
prediction['trend_analysis'] = section.split('A - Trend Analysis:', 1)[1].strip()
elif 'B - Ratio Analysis:' in section:
prediction['ratio_analysis'] = section.split('B - Ratio Analysis:', 1)[1].strip()
elif 'C - Rationale:' in section:
rationale_section = section.split('C - Rationale:', 1)[1].strip()
prediction['rationale'] = rationale_section
# Extract direction, magnitude, and confidence from rationale
lines = rationale_section.split('\n')
for line in lines:
if line.startswith('- **Direction**:'):
direction = line.split(':', 1)[1].strip().lower()
prediction['direction'] = 1 if 'increase' in direction else (-1 if 'decrease' in direction else 0)
elif line.startswith('- **Magnitude**:'):
prediction['magnitude'] = line.split(':', 1)[1].strip().lower()
elif line.startswith('- **Confidence**:'):
try:
prediction['confidence'] = float(line.split(':', 1)[1].strip())
except ValueError:
prediction['confidence'] = 0.0
prediction['token_usage'] = {
'total_tokens': cb.total_tokens,
}
return prediction

def run_predictions(cd_cvm_list: List[str], models_to_test: List[tuple]) -> pd.DataFrame:
results = []
# Get financial statements for all companies at once
financial_statements_batch = get_financial_statements_batch(cd_cvm_list)
for model_name, model_kwargs in models_to_test:
print(f"\nTesting model: {model_name}")
llm = get_llm(model_name, **model_kwargs)
prompt = create_prompt_template()
chain = prompt | llm
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
future_to_prediction = {}
for cd_cvm, (income_statement, balance_sheet, cash_flow) in financial_statements_batch.items():
actual_results = calculate_actual_results(income_statement)

# Prepare financial data for all periods
financial_statements = {
'income_statement': income_statement.to_dict(orient='records'),
'balance_sheet': balance_sheet.to_dict(orient='records'),
'cash_flow_statement': cash_flow.to_dict(orient='records')
}
financial_data = json.dumps(financial_statements)

for target_period, actual_result in actual_results:
future = executor.submit(get_financial_prediction, financial_data, target_period, chain)
future_to_prediction[(cd_cvm, target_period, actual_result)] = future
for (cd_cvm, target_period, actual_result), future in future_to_prediction.items():
try:
prediction = future.result()
results.append({
'Model': model_name,
'TREND ANALYSIS': prediction.get('trend_analysis', ''),
'RATIO ANALYSIS': prediction.get('ratio_analysis', ''),
'RATIONALE': prediction.get('rationale', ''),
'DIRECTION': prediction.get('direction', ''),
'MAGNITUDE': prediction.get('magnitude', ''),
'CONFIDENCE LEVEL': prediction.get('confidence', ''),
'ACTUAL DIRECTION': actual_result,
'CD_CVM': cd_cvm,
'TARGET PERIOD': target_period
})
except Exception as exc:
print(f'Generated an exception for {cd_cvm}, {target_period}: {exc}')
print(f'Exception type: {type(exc)}')
print(f'Exception details: {str(exc)}')
return pd.DataFrame(results)

def calculate_metrics(predictions: List[int], actual_results: List[int]) -> Dict[str, Any]:
if not predictions or not actual_results:
return {'accuracy': 0.0, 'f1_score': 0.0}

accuracy = float(accuracy_score(actual_results, predictions))

# Check if it's binary or multiclass
unique_labels = set(actual_results + predictions)
if len(unique_labels) <= 2:
f1 = float(f1_score(actual_results, predictions, average='binary'))
else:
f1 = float(f1_score(actual_results, predictions, average='weighted'))

# Calculate confusion matrix
cm = confusion_matrix(actual_results, predictions)

return {
'accuracy': accuracy,
'f1_score': f1,
'confusion_matrix': cm.tolist(), # Convert to list for JSON serialization
'support': {
'increase': int(np.sum(np.array(actual_results) == 1)),
'decrease': int(np.sum(np.array(actual_results) == -1))
}
}
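# Usage sketch with toy labels (illustrative only):
#   calculate_metrics(predictions=[1, -1, 1, 1], actual_results=[1, -1, -1, 1])
# gives accuracy 0.75, binary F1 of 0.8 for the positive (increase) class, a 2x2 confusion matrix over
# the labels {-1, 1}, and support counts of 2 increases and 2 decreases in the actuals.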

# Main execution
if __name__ == "__main__":
cd_cvm_list = get_distinct_cd_cvm()

models_to_test = [
('openai/gpt-4-turbo', {}),
('anthropic/claude-3.5-sonnet', {})
]

results_df = run_predictions(cd_cvm_list, models_to_test)
print(results_df.to_string())

# Calculate and print metrics
predictions = results_df['DIRECTION'].tolist()
actual_results = results_df['ACTUAL DIRECTION'].tolist()
metrics = calculate_metrics(predictions, actual_results)
print("\nMetrics:")
print(json.dumps(metrics, indent=2))
8 changes: 8 additions & 0 deletions cvm_db.code-workspace
@@ -0,0 +1,8 @@
{
"folders": [
{
"path": "."
}
],
"settings": {}
}

0 comments on commit e5a175b
