Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update #8

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,6 @@ config.yaml

# remove output files that aren't the first one
outputs/output_*


wallet/*
26 changes: 22 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,27 @@ After completion, you should have following 2 things in your ~/.oci directory:

Next, create a new file called `config.yaml` with the structure described below. It lets you authenticate to OCI and call the OCI GenAI summarization model, which summarizes the content of each project's README file:

```yml
compartment_id: "ocid1.compartment.oc1..ocid"
config_profile: "profile_name_in_your_oci_config"
1. Copy `config_example.yaml` to `config.yaml`:

```bash
cp config_example.yaml config.yaml
```

2. Update the following values in your `config.yaml`:

- `compartment_id`: Your OCI compartment OCID
- `config_profile`: Your OCI CLI profile name (usually "DEFAULT")
- `db_username`: Your database username (default is "ADMIN")
- `db_password`: Your database user's password
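- `db_dsn`: Your database connection string. This is either a TNS name from the `tnsnames.ora` file in your wallet directory (e.g., "dbname_high") or a full connect descriptor, which includes:
  - host: Your database hostname (e.g., "adb.us-ashburn-1.oraclecloud.com")
  - service_name: Your database service name
  - port: Database port (usually 1522)
- `db_wallet_location`: The directory containing your database wallet files (e.g., "./wallet")
- `db_wallet_password`: Your wallet password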

3. This is an example database connection string format:

```yaml
db_dsn: "(description= (retry_count=5)(retry_delay=2)(address=(protocol=tcps)(port=1522)(host=adb.region.oraclecloud.com))(connect_data=(service_name=your_db_name_high.adb.oraclecloud.com))(security=(ssl_server_dn_match=yes)))"
```

> **Note**: You can find your OCI configuration in `~/.oci/config`. Make sure you have previously installed the [OCI SDK on your computer](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/sdkconfig.htm).
Expand All @@ -81,7 +99,7 @@ chmod a+x run.sh # if you don't have exec permissions initially for the .sh file
```sh
scrapy runspider trending_spider.py # this will get trending repositories
scrapy runspider info_spider.py # then, for each trending repository, it will extract info.
python main.py # to process their README.md files as well, and runs a summarizer on top of it.
python main.py # processes their README.md files as well, runs a summarizer on top of them, and inserts the summaries into an autonomous database.
```

## Appendix: Getting Started with LinkedIn Poster
Expand Down
14 changes: 14 additions & 0 deletions config_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# OCI Configuration
compartment_id: "ocid1.compartment.oc1..example" # Your OCI compartment OCID
config_profile: "DEFAULT" # Your OCI CLI profile name, usually "DEFAULT"

# Database Configuration
db_username: "ADMIN" # Default admin username for Autonomous Database
db_password: "MySecurePass123!" # Replace with your actual database password
db_wallet_location: "./wallet" # Directory containing the wallet files
db_wallet_password: "wallet_password" # Default wallet password, change if you modified it

# Database Connection String (DSN)
# Use the TNS name from tnsnames.ora in your wallet directory
# Example: dbname_high, dbname_low, or dbname_medium
db_dsn: "dbname_high" # Replace with your database's TNS name
112 changes: 112 additions & 0 deletions db_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import oracledb
from datetime import datetime
import os

class DatabaseHandler:
    """Persist repository summaries in an Oracle database through ``oracledb``.

    Typical usage is ``connect()`` once, ``insert_summary()`` for every
    summary, and ``close()`` when done. ``connect()`` also creates the
    ``repository_summaries`` table when it does not exist yet.
    """

    def __init__(self, username, password, dsn, wallet_location, wallet_password):
        """Store the connection settings; no database access happens here.

        Args:
            username: Database user name.
            password: Password of that database user.
            dsn: Connection string handed to ``oracledb.connect``.
            wallet_location: Directory with the wallet files. It is also used
                as the Oracle Client configuration directory.
            wallet_password: Password protecting the wallet.
        """
        self.username = username
        self.password = password
        self.dsn = dsn
        self.wallet_location = wallet_location
        self.wallet_password = wallet_password
        # Stays None until connect() succeeds and is reset to None by close().
        self.connection = None

    def _require_connection(self):
        """Return the open connection or raise ``RuntimeError`` if there is none."""
        if self.connection is None:
            raise RuntimeError(
                "Database connection is not open; call connect() first"
            )
        return self.connection

    def _release_connection(self):
        """Best-effort close used on error paths; never raises.

        A failure while closing is only reported, so that it cannot replace
        the original exception the caller is about to re-raise.
        """
        connection, self.connection = self.connection, None
        if connection is None:
            return
        try:
            connection.close()
        except Exception as close_error:
            print(f"Error closing database connection: {close_error}")

    def connect(self):
        """Open the connection in thick mode and make sure the table exists.

        Raises:
            Exception: Whatever ``oracledb`` raised while initialising the
                client, connecting, or creating the table. The error is
                printed and re-raised. A connection that was already opened
                is closed first so it does not leak.
        """
        try:
            # Initialize Oracle Client library for thick mode with specific
            # config directory.
            # NOTE(review): the fallback lib_dir is a Windows-style path; on
            # other platforms ORACLE_CLIENT_PATH presumably has to be set.
            oracledb.init_oracle_client(
                config_dir=self.wallet_location,
                lib_dir=os.getenv("ORACLE_CLIENT_PATH", "C:\\oracle\\instantclient"),
            )

            # Configure the wallet location
            self.connection = oracledb.connect(
                user=self.username,
                password=self.password,
                dsn=self.dsn,
                wallet_location=self.wallet_location,
                wallet_password=self.wallet_password,
            )
            print("Successfully connected to Oracle Database using thick mode")
            self._create_table()
        except Exception as e:
            print(f"Error connecting to database: {e}")
            # Do not leave a half-initialised connection open: callers that
            # invoke connect() outside their try/finally would never close it.
            self._release_connection()
            raise

    def _table_exists(self, table_name):
        """Return True when the current user owns a table named *table_name*.

        The name is upper-cased before the lookup in ``user_tables``.
        """
        check_sql = """
            SELECT COUNT(*)
            FROM user_tables
            WHERE table_name = :1
        """
        with self._require_connection().cursor() as cursor:
            cursor.execute(check_sql, [table_name.upper()])
            count = cursor.fetchone()[0]
        return count > 0

    def _create_table(self):
        """Create ``repository_summaries`` unless it already exists.

        Raises:
            Exception: Any error raised while executing the DDL; it is
                printed and re-raised.
        """
        if self._table_exists('repository_summaries'):
            return

        create_table_sql = """
            CREATE TABLE repository_summaries (
                id NUMBER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
                summary_text CLOB,
                created_date DATE,
                daily_position NUMBER,
                file_path VARCHAR2(255)
            )
        """
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(create_table_sql)
            self.connection.commit()
            print("Table repository_summaries created successfully")
        except Exception as e:
            print(f"Error creating table: {e}")
            raise

    def insert_summary(self, summary_text: str, daily_position: int, file_path: str):
        """Insert one summary row, commit it, and print the stored record.

        Args:
            summary_text: The summary to store.
            daily_position: Position of the repository in the current run.
            file_path: Path of the file the summary was also written to.

        Raises:
            RuntimeError: If ``connect()`` has not been called successfully.
            Exception: Any database error; it is printed and re-raised.
        """
        connection = self._require_connection()

        print("\nInserting summary into database:")
        print(f"- Position: {daily_position}")
        print(f"- File: {file_path}")
        print(f"- Summary length: {len(summary_text)} characters")

        insert_sql = """
            INSERT INTO repository_summaries
            (summary_text, created_date, daily_position, file_path)
            VALUES (:1, :2, :3, :4)
        """
        # ORDER BY makes fetchone() return the newest row when several rows
        # share the same position and timestamp.
        verify_sql = """
            SELECT id, daily_position, created_date, file_path
            FROM repository_summaries
            WHERE daily_position = :1
            AND created_date = :2
            ORDER BY id DESC
        """
        try:
            # created_date is a DATE column, which keeps whole seconds only.
            # Dropping microseconds up front guarantees that the equality
            # test in the verification query compares identical values.
            current_time = datetime.now().replace(microsecond=0)
            with connection.cursor() as cursor:
                cursor.execute(
                    insert_sql,
                    [summary_text, current_time, daily_position, file_path],
                )
                connection.commit()

                # Verify the insertion by fetching the latest record
                cursor.execute(verify_sql, [daily_position, current_time])
                result = cursor.fetchone()

            if result:
                print("✓ Successfully inserted summary:")
                print(f" - Database ID: {result[0]}")
                print(f" - Position: {result[1]}")
                print(f" - Timestamp: {result[2]}")
                print(f" - File: {result[3]}")
            else:
                print("! Warning: Insertion succeeded but verification failed")

        except Exception as e:
            print(f"✗ Error inserting summary: {e}")
            raise

    def close(self):
        """Close the connection if one is open; safe to call repeatedly."""
        if self.connection is None:
            return
        # Forget the handle before closing so a second close() is a no-op
        # even if closing raises.
        connection, self.connection = self.connection, None
        connection.close()
        print("\nDatabase connection closed successfully")
82 changes: 50 additions & 32 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import json
from nltk.tokenize import word_tokenize
import re
import yaml
from db_handler import DatabaseHandler

# python scripts
from readme_reader import main as run_readme_reader
Expand All @@ -16,52 +18,68 @@ def count_tokens(data: str = "") -> None:
print('*******{}*******'.format(len(word_tokenize(data))))
return len(word_tokenize(data))



def preprocess_string(data: str) -> str:
    """Return *data* without non-ASCII characters and without ``<...>`` tags.

    Args:
        data: Raw text, e.g. the content of a README file.

    Returns:
        A pure-ASCII string with every ``<...>`` span removed. Tags are
        matched non-greedily and never across a newline.
    """
    # Drop every character outside the 7-bit ASCII range.
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', data)
    # Strip tags from the ASCII-only text; reading the raw input again here
    # would discard the result of the previous substitution.
    return re.sub(r'<.*?>', r'', ascii_only)

#
#The listed limit is 128K for input + output..
#https://docs.oracle.com/en-us/iaas/Content/generative-ai/limitations.htm

def main():
readme_list = run_readme_reader()
print('Obtained {} README records'.format(len(readme_list)))
# Load database configuration
with open('config.yaml', 'r') as file:
config = yaml.safe_load(file)

iterator = 1
for x in readme_list:
new_text = preprocess_string(x)
# Initialize database connection
db = DatabaseHandler(
username=config['db_username'],
password=config['db_password'],
dsn=config['db_dsn'],
wallet_location=config['db_wallet_location'],
wallet_password=config['db_wallet_password']
)
db.connect()

'''with open('local_file.txt', 'w') as file:
file.write(new_text)
file.close()'''
try:
readme_list = run_readme_reader()
print('Obtained {} README records'.format(len(readme_list)))

if (len(new_text)) < 250:
print('Skipping iteration as it does not have enough data to summarize')
iterator += 1
continue
iterator = 1
for x in readme_list:
new_text = preprocess_string(x)

print('Text length: {}'.format(len(x)))
print('Text length: {}'.format(len(new_text)))
if len(new_text) > 10000:
new_text = new_text[0:10000]
else: new_text = new_text
print('Text length: {}'.format(len(new_text)))
summary = run_summarizer(new_text)
print(summary)
if (len(new_text)) < 250:
print('Skipping iteration as it does not have enough data to summarize')
iterator += 1
continue

with open('outputs/output_{}.txt'.format(iterator), 'w', encoding='utf-8') as file:
file.write(summary)
file.close()

iterator += 1
print('Text length: {}'.format(len(x)))
print('Text length: {}'.format(len(new_text)))
if len(new_text) > 10000:
new_text = new_text[0:10000]
else:
new_text = new_text
print('Text length: {}'.format(len(new_text)))

summary = run_summarizer(new_text)
print(summary)

# Save to file
output_file = f'outputs/output_{iterator}.txt'
with open(output_file, 'w', encoding='utf-8') as file:
file.write(summary)

# Save to database
db.insert_summary(
summary_text=summary,
daily_position=iterator,
file_path=output_file
)

iterator += 1

finally:
db.close()

if __name__ == '__main__':
main()
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
scrapy
PyGithub
oci
nltk
nltk
oracledb
4 changes: 2 additions & 2 deletions summarize_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def main(summary_txt: str = "") -> None:

# cohere.command-r-plus: ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceya7ozidbukxwtun4ocm4ngco2jukoaht5mygpgr6gq2lgq
# cohere.command for generation: ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceyafhwal37hxwylnpbcncidimbwteff4xha77n5xz4m7p6a
# new model - llama3: ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceyaycmwwnvu2gaqrffquofgmshlqzcdwpk727n4cykg34oa
chat_detail.serving_mode = oci.generative_ai_inference.models.OnDemandServingMode(model_id="ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceyaycmwwnvu2gaqrffquofgmshlqzcdwpk727n4cykg34oa")
# new model - llama3: ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceyaiir6nnhmlgwvh37dr2mvragxzszqmz3hok52pcgmpqta
chat_detail.serving_mode = oci.generative_ai_inference.models.OnDemandServingMode(model_id="ocid1.generativeaimodel.oc1.us-chicago-1.amaaaaaask7dceyaiir6nnhmlgwvh37dr2mvragxzszqmz3hok52pcgmpqta")
chat_detail.chat_request = llm_inference_request
chat_detail.compartment_id = compartment_id
chat_response = generative_ai_inference_client.chat(chat_detail)
Expand Down