-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit e5a175b
Showing
103 changed files
with
5,558,219 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "", | ||
"evalue": "", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[1;31mRunning cells with 'Python 3.12.4' requires the ipykernel package.\n", | ||
"\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", | ||
"\u001b[1;31mCommand: '/usr/local/bin/python3 -m pip install ipykernel -U --user --force-reinstall'" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from handler import *\n", | ||
"suz = 13986\n", | ||
"cf = return_df(CD_CVM=suz, statment='CF')\n", | ||
"inc = return_df(CD_CVM=suz, statment='IS')\n", | ||
"bs = return_df(CD_CVM=suz, statment='BS')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ref=read_files_ref()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "new-db-PuNDkLGb-py3.12", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.14" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
import os | ||
import json | ||
from typing import Dict, Any, List, Tuple | ||
from concurrent.futures import ThreadPoolExecutor, as_completed | ||
from langchain_core.prompts import ChatPromptTemplate | ||
from langchain_core.runnables import RunnableSequence | ||
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix | ||
import pandas as pd | ||
import numpy as np | ||
from langchain_openai import ChatOpenAI, OpenAIEmbeddings | ||
from langchain_community.callbacks import get_openai_callback | ||
from fetcher import execute_query_batch, get_distinct_cd_cvm | ||
|
||
def get_financial_statements_batch(cd_cvm_list: List[str]) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Fetch the three financial statements for every requested company.

    One batch query is issued per statement code ('ist', 'bs', 'cf') through
    ``execute_query_batch``; the per-statement results are then regrouped by
    company.

    Args:
        cd_cvm_list: Company codes to fetch.

    Returns:
        A dict keyed by company code. Each value is the tuple
        ``(income_statement, balance_sheet, cash_flow)``; a statement that is
        absent from its query result is replaced by an empty DataFrame.
    """
    # Query order matters for the tuple layout: income, balance sheet, cash flow.
    statement_batches = [
        execute_query_batch(cd_cvm_list, statement_code)
        for statement_code in ('ist', 'bs', 'cf')
    ]

    statements_by_company = {}
    for cd_cvm in cd_cvm_list:
        statements_by_company[cd_cvm] = tuple(
            batch.get(cd_cvm, pd.DataFrame()) for batch in statement_batches
        )
    return statements_by_company
|
||
def calculate_actual_results(income_statement: pd.DataFrame) -> List[Tuple[str, int]]:
    """Compute the realised year-over-year earnings direction per fiscal year.

    Earnings are read from the first row whose 'DS_CONTA' equals
    'Resultado Líquido das Operações Continuadas'. Only columns whose label is
    a string starting with '20' and ending with '-12-31' are treated as
    periods; they are compared in sorted order, each against its predecessor.

    Args:
        income_statement: Income statement with a 'DS_CONTA' column and one
            column per period.

    Returns:
        A list of ``(period, direction)`` tuples where direction is 1 when
        earnings are strictly greater than in the previous period and -1
        otherwise (equal earnings count as -1). Pairs with a null value on
        either side are omitted. The list is empty when the statement has no
        matching earnings row or fewer than two period columns.

    Raises:
        ValueError: If the 'DS_CONTA' column is missing.
    """
    earnings_account = 'Resultado Líquido das Operações Continuadas'

    if 'DS_CONTA' not in income_statement.columns:
        raise ValueError("Expected 'DS_CONTA' column in income statement")

    earnings_rows = income_statement[income_statement['DS_CONTA'] == earnings_account]
    if earnings_rows.empty:
        # No earnings line for this company: nothing to compare (indexing the
        # first row would raise IndexError).
        return []

    # Non-string column labels cannot be period columns and have no
    # startswith/endswith, so filter them out explicitly.
    period_columns = sorted(
        col for col in earnings_rows.columns
        if isinstance(col, str) and col.startswith('20') and col.endswith('-12-31')
    )
    earnings = earnings_rows.iloc[0]

    results = []
    for previous_period, current_period in zip(period_columns, period_columns[1:]):
        previous_earnings = earnings[previous_period]
        current_earnings = earnings[current_period]
        if pd.notnull(current_earnings) and pd.notnull(previous_earnings):
            direction = 1 if current_earnings > previous_earnings else -1
            results.append((current_period, direction))

    return results
|
||
def create_prompt_template() -> ChatPromptTemplate:
    """Build the chat prompt that requests an earnings prediction.

    The template exposes two placeholders, ``financial_data`` and
    ``target_period``, and instructs the model to answer with Panel A/B/C
    sections followed by ``Direction``, ``Magnitude`` and ``Confidence`` lines.

    Returns:
        A ``ChatPromptTemplate`` created from the template text.
    """
    # Wording fixes only: the original text contained a double negative
    # ("cant never be 0") and two garbled sentences; structure is unchanged.
    template = """
As a financial expert, analyze the provided financial statements and predict future earnings for the specified target period. You MUST provide analysis and prediction for the target period. Follow this structure:
1. Trend Analysis (Panel A): Analyze relevant trends over the past three years.
2. Ratio Analysis (Panel B): Calculate and analyze the financial ratios you consider relevant, and provide economic interpretations of the computed ratios and their implications for future earnings.
3. Rationale (Panel C): Summarize your trend and ratio analyses to make a prediction. Explain your prediction reasoning concisely.
4. Prediction: State the earnings direction (increase/decrease), magnitude (large/moderate/small), and confidence (0.0-1.0).
Financial data: {financial_data}
Target period: {target_period}
Provide your analysis in this format for the target period:
Panel A - Trend Analysis: [Brief trend analysis]
Panel B - Ratio Analysis: [Brief ratio analysis]
Panel C - Rationale: [Concise rationale for prediction]
Direction: [increase/decrease]
Magnitude: [large/moderate/small]
Confidence: [0.00 to 1.00]
Note: Direction will be interpreted as 1 for increase and -1 for decrease.
You MUST provide all sections (Panel A, B, C, Direction, Magnitude, and Confidence) for the target period.
You MUST make a prediction; Direction can never be 0.
If data is limited, make reasonable assumptions based on available information and state these assumptions in your analysis.
Try to maintain response under 500 tokens.
"""
    return ChatPromptTemplate.from_template(template)
|
||
def get_llm(model_name: str, **kwargs) -> ChatOpenAI:
    """Create a chat model client for ``model_name``.

    Names starting with ``openai/`` are sent to ``ChatOpenAI`` with its
    default endpoint, using the part after the last '/' as the model name.
    Every other name is routed to the OpenRouter endpoint and authenticated
    with the ``OPENROUTER_API_KEY`` environment variable.

    Args:
        model_name: Provider-qualified model name, e.g. 'openai/gpt-4-turbo'.
        **kwargs: Extra ``ChatOpenAI`` arguments; they override the defaults
            (temperature 0, logprobs enabled, top_p 1).

    Returns:
        The configured ``ChatOpenAI`` instance.

    Raises:
        ValueError: If an OpenRouter model is requested while
            ``OPENROUTER_API_KEY`` is unset or empty.
    """
    base_kwargs = {
        "temperature": 0,
        "model_kwargs": {"logprobs": True, "top_p": 1},
        **kwargs,
    }

    if model_name.startswith('openai/'):
        return ChatOpenAI(model_name=model_name.split('/')[-1], **base_kwargs)

    openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
    if not openrouter_api_key:
        # Fail fast instead of building a client with a None key.
        # NOTE(review): with a None key the underlying client presumably falls
        # back to OPENAI_API_KEY, which would then be sent to OpenRouter.
        raise ValueError(
            f"OPENROUTER_API_KEY must be set to use model '{model_name}' via OpenRouter"
        )

    return ChatOpenAI(
        model=model_name,
        openai_api_base="https://openrouter.ai/api/v1",
        openai_api_key=openrouter_api_key,
        **base_kwargs,
    )
|
||
def _extract_labelled_value(text: str, label: str):
    """Return the value of the last ``label: value`` line in *text*, or None.

    Markdown decoration around the label (leading bullets, bold markers) is
    ignored, so ``Direction: increase``, ``- **Direction**: increase`` and
    ``**Direction:** increase`` all match the label 'direction'.
    """
    found = None
    for raw_line in text.splitlines():
        cleaned = raw_line.replace('**', '').strip().lstrip('-*#> \t')
        name, separator, value = cleaned.partition(':')
        if separator and name.strip().lower() == label:
            found = value.strip()
    return found


def _parse_confidence(raw_value: str) -> float:
    """Parse a confidence value such as '0.85' or '[0.85]'; 0.0 if unparsable."""
    tokens = raw_value.split()
    if not tokens:
        return 0.0
    try:
        return float(tokens[0].strip('[]()').rstrip('.,;'))
    except ValueError:
        return 0.0


def get_financial_prediction(financial_data: str, target_period: str, chain: RunnableSequence) -> Dict[str, Any]:
    """Invoke the prediction chain once and parse its structured answer.

    Args:
        financial_data: Serialized financial statements inserted into the prompt.
        target_period: Period the model is asked to predict.
        chain: Runnable invoked with ``financial_data`` and ``target_period``.

    Returns:
        A dict with keys 'trend_analysis', 'ratio_analysis', 'rationale',
        'direction' (1 increase, -1 decrease, 0 when not found), 'magnitude',
        'confidence' and 'token_usage' ({'total_tokens': ...}). Sections that
        cannot be found keep their placeholder defaults.
    """
    with get_openai_callback() as cb:
        response = chain.invoke({"financial_data": financial_data, "target_period": target_period})

    prediction = {
        'trend_analysis': 'Analysis not provided',
        'ratio_analysis': 'Analysis not provided',
        'rationale': 'Rationale not provided',
        'direction': 0,
        'magnitude': 'unknown',
        'confidence': 0.0
    }

    # Chat models return a message object; fall back to str() for anything else.
    response_text = response.content if hasattr(response, 'content') else str(response)

    panel_headers = (
        ('trend_analysis', 'A - Trend Analysis:'),
        ('ratio_analysis', 'B - Ratio Analysis:'),
        ('rationale', 'C - Rationale:'),
    )
    for section in response_text.split('Panel'):
        for key, header in panel_headers:
            if header in section:
                prediction[key] = section.split(header, 1)[1].strip()
                break

    # The prompt asks for plain "Direction: ..." lines, while models often add
    # markdown bullets/bold; accept both instead of only "- **Direction**:".
    direction = _extract_labelled_value(response_text, 'direction')
    if direction is not None:
        direction = direction.lower()
        prediction['direction'] = 1 if 'increase' in direction else (-1 if 'decrease' in direction else 0)

    magnitude = _extract_labelled_value(response_text, 'magnitude')
    if magnitude is not None:
        prediction['magnitude'] = magnitude.lower()

    confidence = _extract_labelled_value(response_text, 'confidence')
    if confidence is not None:
        prediction['confidence'] = _parse_confidence(confidence)

    prediction['token_usage'] = {
        'total_tokens': cb.total_tokens,
    }
    return prediction
|
||
def run_predictions(cd_cvm_list: List[str], models_to_test: List[tuple]) -> pd.DataFrame:
    """Run every model on every (company, target period) pair.

    Args:
        cd_cvm_list: Company codes to evaluate.
        models_to_test: ``(model_name, model_kwargs)`` tuples passed to ``get_llm``.

    Returns:
        A DataFrame with one row per successful prediction, holding the model
        name, the parsed analysis fields, the predicted and actual direction,
        the company code and the target period. Failed predictions are
        reported on stdout and omitted.
    """
    results = []
    # Get financial statements for all companies at once
    financial_statements_batch = get_financial_statements_batch(cd_cvm_list)

    # The prompt and the per-company payloads do not depend on the model, so
    # build them once instead of once per model.
    prompt = create_prompt_template()
    work_items = []
    for cd_cvm, (income_statement, balance_sheet, cash_flow) in financial_statements_batch.items():
        try:
            actual_results = calculate_actual_results(income_statement)
        except (ValueError, IndexError) as exc:
            # A company with an empty or malformed income statement must not
            # abort the whole run; skip it and keep going.
            print(f'Skipping {cd_cvm}: {exc}')
            continue
        if not actual_results:
            continue

        # Prepare financial data for all periods
        # NOTE(review): the payload contains every period column, including the
        # target period's own figures — confirm this is intended.
        financial_statements = {
            'income_statement': income_statement.to_dict(orient='records'),
            'balance_sheet': balance_sheet.to_dict(orient='records'),
            'cash_flow_statement': cash_flow.to_dict(orient='records')
        }
        financial_data = json.dumps(financial_statements)

        for target_period, actual_result in actual_results:
            work_items.append((cd_cvm, target_period, actual_result, financial_data))

    for model_name, model_kwargs in models_to_test:
        print(f"\nTesting model: {model_name}")
        llm = get_llm(model_name, **model_kwargs)
        chain = prompt | llm

        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            future_to_prediction = {
                (cd_cvm, target_period, actual_result): executor.submit(
                    get_financial_prediction, financial_data, target_period, chain
                )
                for cd_cvm, target_period, actual_result, financial_data in work_items
            }

            for (cd_cvm, target_period, actual_result), future in future_to_prediction.items():
                try:
                    prediction = future.result()
                except Exception as exc:
                    # Best effort: one failed call should not discard the others.
                    print(f'Generated an exception for {cd_cvm}, {target_period}: {exc}')
                    print(f'Exception type: {type(exc)}')
                    print(f'Exception details: {str(exc)}')
                    continue

                results.append({
                    'Model': model_name,
                    'TREND ANALYSIS': prediction.get('trend_analysis', ''),
                    'RATIO ANALYSIS': prediction.get('ratio_analysis', ''),
                    'RATIONALE': prediction.get('rationale', ''),
                    'DIRECTION': prediction.get('direction', ''),
                    'MAGNITUDE': prediction.get('magnitude', ''),
                    'CONFIDENCE LEVEL': prediction.get('confidence', ''),
                    'ACTUAL DIRECTION': actual_result,
                    'CD_CVM': cd_cvm,
                    'TARGET PERIOD': target_period
                })

    return pd.DataFrame(results)
|
||
def calculate_metrics(predictions: List[int], actual_results: List[int]) -> Dict[str, Any]:
    """Score predicted earnings directions against the realised ones.

    Args:
        predictions: Predicted directions (1 increase, -1 decrease, 0 unparsed).
        actual_results: Realised directions, aligned with ``predictions``.

    Returns:
        A dict with 'accuracy', 'f1_score', 'confusion_matrix' (nested lists,
        JSON serializable) and 'support' (counts of actual increases and
        decreases). When either input is empty, accuracy and F1 are 0.0 and
        the confusion matrix is an empty list.
    """
    predicted = list(predictions)
    actual = list(actual_results)
    support = {
        'increase': sum(1 for value in actual if value == 1),
        'decrease': sum(1 for value in actual if value == -1)
    }

    if not predicted or not actual:
        # Same keys as the regular result so callers can rely on the shape.
        return {
            'accuracy': 0.0,
            'f1_score': 0.0,
            'confusion_matrix': [],
            'support': support
        }

    accuracy = float(accuracy_score(actual, predicted))

    # average='binary' scores the default positive label 1; scikit-learn
    # rejects it when two labels are present and 1 is not one of them
    # (e.g. {-1, 0}), so use the weighted average in that case as well.
    labels_seen = set(actual) | set(predicted)
    binary_ok = len(labels_seen) < 2 or (len(labels_seen) == 2 and 1 in labels_seen)
    average = 'binary' if binary_ok else 'weighted'
    f1 = float(f1_score(actual, predicted, average=average))

    # Calculate confusion matrix
    cm = confusion_matrix(actual, predicted)

    return {
        'accuracy': accuracy,
        'f1_score': f1,
        'confusion_matrix': cm.tolist(),  # Convert to list for JSON serialization
        'support': support
    }
|
||
# Main execution
if __name__ == "__main__":
    cd_cvm_list = get_distinct_cd_cvm()

    models_to_test = [
        ('openai/gpt-4-turbo', {}),
        ('anthropic/claude-3.5-sonnet', {})
    ]

    results_df = run_predictions(cd_cvm_list, models_to_test)
    print(results_df.to_string())

    if results_df.empty:
        # A DataFrame built from an empty list has no columns, so selecting
        # 'DIRECTION' below would raise KeyError.
        print("\nNo predictions were produced; skipping metrics.")
    else:
        # Calculate and print metrics over all models combined
        predictions = results_df['DIRECTION'].tolist()
        actual_results = results_df['ACTUAL DIRECTION'].tolist()
        metrics = calculate_metrics(predictions, actual_results)
        print("\nMetrics:")
        print(json.dumps(metrics, indent=2))

        # The combined figures mix every model's predictions; report each
        # model separately so they can actually be compared.
        for model_name, model_results in results_df.groupby('Model'):
            model_metrics = calculate_metrics(
                model_results['DIRECTION'].tolist(),
                model_results['ACTUAL DIRECTION'].tolist(),
            )
            print(f"\nMetrics for {model_name}:")
            print(json.dumps(model_metrics, indent=2))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"folders": [ | ||
{ | ||
"path": "." | ||
} | ||
], | ||
"settings": {} | ||
} |
Oops, something went wrong.