-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpodcast_backend.py
188 lines (163 loc) · 7.66 KB
/
podcast_backend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import modal
def download_whisper():
    """Pre-fetch the Whisper "medium" checkpoint into ``/content/podcast/``.

    This function is handed to ``run_function`` when the container image is
    defined, so the download happens once and the weights are stored with
    the image rather than being fetched on every transcription call.
    """
    # Imported inside the function: whisper is installed via the image's
    # pip_install list, so it is not assumed to exist where this module is
    # first imported.
    import whisper

    print("Download the Whisper model")
    # NOTE(review): `_download` and `_MODELS` are private whisper helpers.
    # They are used here against the whisper commit pinned in the image;
    # re-check this call if that pin changes. The third positional argument
    # is presumably `in_memory` — confirm against the pinned source.
    whisper._download(whisper._MODELS["medium"], '/content/podcast/', False)
# Modal app object; the functions in this file register themselves on it
# through the @stub.function / @stub.local_entrypoint decorators.
stub = modal.Stub("corise-podcast-project")

# Container image used by the remote functions: Debian slim plus the Python
# packages below, the ffmpeg system package, and the result of running
# download_whisper at image definition time.
corise_image = (
    modal.Image.debian_slim()
    .pip_install(
        "feedparser",
        # Whisper is installed from a pinned commit archive.
        "https://github.com/openai/whisper/archive/9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d.tar.gz",
        "requests",
        "ffmpeg",
        "openai",
        "tiktoken",
        "wikipedia",
        "ffmpeg-python",
    )
    .apt_install("ffmpeg")
    .run_function(download_whisper)
)
@stub.function(image=corise_image, gpu="any", timeout=600)
def get_transcribe_podcast(rss_url, local_path):
    """Download the first episode listed in an RSS feed and transcribe it.

    Args:
        rss_url: URL of the podcast RSS feed.
        local_path: Directory where the downloaded audio file is stored. It
            is created (including missing parents) when it does not exist.

    Returns:
        A dict with the keys ``podcast_title``, ``episode_title``,
        ``episode_image`` and ``episode_transcript``.

    Raises:
        ValueError: If the feed has no entries, or its first entry has no
            link of type ``audio/mpeg``.
        requests.HTTPError: If the audio download answers with an error
            status (raised by ``raise_for_status``).
    """
    from pathlib import Path

    import feedparser
    import requests

    print("Starting Podcast Transcription Function")
    print("Feed URL: ", rss_url)
    print("Local Path:", local_path)

    # Read from the RSS feed URL.
    intelligence_feed = feedparser.parse(rss_url)
    podcast_title = intelligence_feed['feed']['title']
    if not intelligence_feed.entries:
        raise ValueError(f"No episodes found in RSS feed: {rss_url}")
    first_entry = intelligence_feed.entries[0]
    episode_title = first_entry['title']
    episode_image = intelligence_feed['feed']['image'].href

    # Pick the audio link of the episode. As before, the last matching link
    # wins when several are present. `.get` is used because a link element
    # is not guaranteed to carry a `type` attribute.
    episode_url = None
    for link in first_entry.get('links', []):
        if link.get('type') == 'audio/mpeg':
            episode_url = link.href
    if episode_url is None:
        raise ValueError(
            f"No 'audio/mpeg' link found for episode {episode_title!r} in feed {rss_url}"
        )
    episode_name = "podcast_episode.mp3"
    print("RSS URL read and episode URL: ", episode_url)

    # Download the podcast episode into local_path.
    target_dir = Path(local_path)
    target_dir.mkdir(parents=True, exist_ok=True)
    episode_path = target_dir / episode_name
    print("Downloading the podcast episode")
    with requests.get(episode_url, stream=True) as r:
        r.raise_for_status()
        with open(episode_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Podcast Episode downloaded")

    import whisper

    # Load the model from the location populated by download_whisper.
    print("Load the Whisper model")
    model = whisper.load_model('medium', device='cuda', download_root='/content/podcast/')

    # Transcribe the exact file that was written above. Building the path by
    # string concatenation (local_path + name) pointed at a different file
    # whenever local_path had no trailing slash.
    print("Starting podcast transcription")
    result = model.transcribe(str(episode_path))

    print("Podcast transcription completed, returning results...")
    return {
        'podcast_title': podcast_title,
        'episode_title': episode_title,
        'episode_image': episode_image,
        'episode_transcript': result['text'],
    }
@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_summary(podcast_transcript):
    """Ask the chat model for a summary of the transcript.

    The instruction text (asking for at most 250 words) is prepended to the
    transcript and sent as a single user message to ``gpt-3.5-turbo-16k``.

    Args:
        podcast_transcript: Transcript text of the episode.

    Returns:
        The message content of the first choice in the model's response.
    """
    import openai

    # The prompt body is kept flush-left so that no indentation leaks into
    # the text sent to the model.
    summary_instructions = """
Summarize the following text on insights from the corporate world. The summary should have a maximum limit of 250 words.
"""
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": summary_instructions + podcast_transcript},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=chat_messages,
    )
    return response.choices[0].message.content
@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_guest(podcast_transcript):
    """Extract the host/guest name from a transcript and look it up on Wikipedia.

    The first 10,000 characters of the transcript are sent to the chat model,
    which is forced to reply through the ``get_podcast_guest_information``
    function call so the name comes back as JSON arguments.

    Args:
        podcast_transcript: Transcript text of the episode.

    Returns:
        A dict with ``summary`` (the Wikipedia summary, or the fixed sentence
        "No information available about the host" when no page could be
        retrieved) and, when the model returned a function call, ``name``
        (the extracted name, ``None`` if the model supplied none).
    """
    import json

    import openai
    import wikipedia

    request = podcast_transcript[:10000]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": request}],
        functions=[
            {
                "name": "get_podcast_guest_information",
                "description": "Get information on the podcast host and their guest (if any) using their name to search on Wikipedia",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "guest_name": {
                            "type": "string",
                            "description": "The names of the host and the guest (if any)",
                        },
                        "unit": {"type": "string"},
                    },
                    "required": ["guest_name"],
                },
            }
        ],
        function_call={"name": "get_podcast_guest_information"}
    )

    podcast_guest_info = {}
    guest_name = ""
    response_message = completion["choices"][0]["message"]
    if response_message.get("function_call"):
        # The arguments are model-generated text, so they may not be valid
        # JSON or may not be an object; treat either case as "no name".
        try:
            function_args = json.loads(response_message["function_call"]["arguments"])
        except json.JSONDecodeError:
            function_args = {}
        if not isinstance(function_args, dict):
            function_args = {}
        guest_name = function_args.get("guest_name")
        podcast_guest_info['name'] = guest_name

    # Try the exact page title first, then let Wikipedia auto-suggest a
    # match. Previously the first attempt always failed because it assigned
    # to the page's `summary` attribute, and the second always failed on
    # undefined variables, so the fallback text was returned every time.
    guest_summary = None
    if guest_name:
        for auto_suggest in (False, True):
            try:
                guest_summary = wikipedia.page(guest_name, auto_suggest=auto_suggest).summary
            except Exception:
                # The lookup is best-effort: a missing or ambiguous page, or
                # a failed request, must not fail the whole function.
                continue
            break

    if guest_summary is None:
        print('Guest not found on Wikipedia')
        guest_summary = "No information available about the host"
    podcast_guest_info['summary'] = guest_summary
    return podcast_guest_info
@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_highlights(podcast_transcript):
    """Ask the chat model for the key insights of the transcript.

    The instruction text (asking for five insights as a bulleted list) is
    prepended to the transcript and sent as a single user message to
    ``gpt-3.5-turbo-16k``.

    Args:
        podcast_transcript: Transcript text of the episode.

    Returns:
        The message content of the first choice in the model's response.
    """
    import openai

    # The prompt body is kept flush-left so that no indentation leaks into
    # the text sent to the model.
    highlight_instructions = """
You are an expert in extracting the insights from a text. You are responsible for reviewing podcast transcripts to highlight the key insights from the podcast. Identify the five most significant insights in the podcast.
- Each insight should be a statement by the podcast host.
- Each insight must be impactful.
- Each highlight must be concise.
- The insight should make the listener want to listen to the entire podcast.
Present the insights as a bulleted list which includes the full sentence of the highlight and nothing else.The output starts as follows:
Key Insights:
"""
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": highlight_instructions + podcast_transcript},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=chat_messages,
    )
    return response.choices[0].message.content
@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"), timeout=1200)
def process_podcast(url, path):
    """Run the whole pipeline for one feed and bundle the results.

    Calls the transcription function first, then passes the resulting
    transcript to the summary, guest and highlights functions, one after
    another.

    Args:
        url: RSS feed URL, forwarded to ``get_transcribe_podcast``.
        path: Local directory, forwarded to ``get_transcribe_podcast``.

    Returns:
        A dict with the keys ``podcast_details``, ``podcast_summary``,
        ``podcast_guest`` and ``podcast_highlights``.
    """
    details = get_transcribe_podcast.call(url, path)
    transcript = details['episode_transcript']
    # Entries are evaluated top to bottom, so the three calls run in the
    # order summary, guest, highlights.
    return {
        'podcast_details': details,
        'podcast_summary': get_podcast_summary.call(transcript),
        'podcast_guest': get_podcast_guest.call(transcript),
        'podcast_highlights': get_podcast_highlights.call(transcript),
    }
@stub.local_entrypoint()
def test_method(url, path):
    """Local entrypoint: transcribe one feed and print the derived results.

    Args:
        url: RSS feed URL, forwarded to ``get_transcribe_podcast``.
        path: Local directory, forwarded to ``get_transcribe_podcast``.
    """
    podcast_details = get_transcribe_podcast.call(url, path)
    transcript = podcast_details['episode_transcript']
    print("Podcast Summary: ", get_podcast_summary.call(transcript))
    print("Podcast Guest Information: ", get_podcast_guest.call(transcript))
    print("Podcast Highlights: ", get_podcast_highlights.call(transcript))