Commit
Fix #112 | v1.1.33 (#118)
Daethyra authored Jan 17, 2024
2 parents 7350798 + 0b8d8c1 commit 8e58a91
Showing 5 changed files with 105 additions and 43 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "Build-RAGAI"
version = "1.1.32"
version = "1.1.33"
description = "Retrieval Augmented Generation code for AI. Prebuilt Python components for leveraging generative AI via Large Language Models."
authors = [
{name = "Daethyra", email = "[email protected]"},
10 changes: 9 additions & 1 deletion src/langchain/notebooks/learn_rag.ipynb
@@ -6,12 +6,18 @@
"source": [
"# Generate Answers with Your Documents\n",
"\n",
"## Table of Contents\n",
"- [RAG by Directly Passing Context](#RAG-by-Directly-Passing-Context)\n",
"- [RAG by Similarity Search](#RAG-by-Similarity-Search)\n",
"\n",
"This notebook show an example of Retrieval Augmented Generation that utilizes LangChain to perform question-answering tasks by combining retrieval and generation techniques.\n",
"\n",
"We will practice retrieval augmented generation once by passing the context straight to the language model, and once by using FAISS and similarity searching.\n",
"\n",
"[FAISS](https://faiss.ai/) is an efficient and high-performance library for vector similarity search. It allows us to find similar things in mountains of data, fast.\n",
"\n",
"---\n",
"\n",
"## What is Retrieval Augmented Generation?\n",
"Retrieval Augmented Generation (RAG)'s purpose is to increase the relevance, accuracy and truthfulness of generation. This way we remove the \"data freshness\" problem that LLM's inherently have.\n",
"\n",
@@ -262,7 +268,9 @@
"\n",
"• Use different language models by specifying the desired model name when creating the `ChatOpenAI` instance. You can explore different models provided by OpenAI, open-source models on the HuggingFace Hub, or use your own fine-tuned models.\n",
"\n",
"• Customize the output parser to parse the generated answer in a format that suits your application's needs."
"• Customize the output parser to parse the generated answer in a format that suits your application's needs.\n",
"\n",
"---"
]
},
{
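For orientation, the notebook's second approach (RAG by similarity search) boils down to roughly the following sketch. This is a minimal illustration, not code from the commit: the sample texts and question are hypothetical, and it assumes `langchain-openai`, `langchain-community`, and `faiss-cpu` are installed with an OpenAI API key configured.

```python
# Minimal sketch of RAG via FAISS similarity search (hypothetical data).
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS

texts = [
    "FAISS is a library for efficient vector similarity search.",
    "LangChain provides components for building LLM applications.",
]

# Embed the documents and index them in FAISS.
vectorstore = FAISS.from_texts(texts, OpenAIEmbeddings())

# Retrieve the chunk most similar to the question.
question = "What is FAISS used for?"
docs = vectorstore.similarity_search(question, k=1)

# Pass the retrieved context to the chat model.
llm = ChatOpenAI(model="gpt-3.5-turbo")
context = "\n".join(d.page_content for d in docs)
answer = llm.invoke(f"Answer using this context:\n{context}\n\nQuestion: {question}")
print(answer.content)
```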
29 changes: 0 additions & 29 deletions src/transformers/audiotranscription/mictranscription/notes.txt

This file was deleted.

src/transformers/audiotranscription/mictranscription/transcribe_microphone.py
@@ -1,6 +1,6 @@
"""
Long-Form Transcription
The Whisper model is intrinsically designed to work on audio samples of up to 30s in duration. However, by using a chunking algorithm, it can be used to transcribe audio samples of up to arbitrary length. This is possible through Transformers pipeline method. Chunking is enabled by setting chunk_length_s=30 when instantiating the pipeline. With chunking enabled, the pipeline can be run with batched inference. It can also be extended to predict sequence level timestamps by passing return_timestamps=True:
The Whisper model is intrinsically designed to work on audio samples of up to 30s in duration. However, by using a chunking algorithm, it can be used to transcribe audio samples of up to arbitrary length. This is possible through Transformers pipeline method. Chunking is enabled by setting chunk_length_s=30 when instantiating the pipeline. With chunking enabled, the pipeline can be run with batched inference. It can also be extended to predict sequence level timestamps by passing return_timestamps=True
"""

import sys
@@ -45,18 +45,19 @@ class RealTimeASR:
        sample_rate (int): The sample rate for audio data (in Hz).
    """

-    def __init__(self, maxlen=300):
+    def __init__(self, maxlen=300, chunk_length_s=30):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.asr_pipeline = pipeline(
            "automatic-speech-recognition",
-            model="openai/whisper-large-v2",
-            chunk_length_s=30,
+            model="openai/whisper-large-v3",
+            chunk_length_s=chunk_length_s,
            device=self.device,
-            return_timestamps=True,
+            return_timestamps=False,
        )
        self.transcription_cache = deque(maxlen=maxlen)
        self.sliding_window = np.array([])
        self.sample_rate = 16000  # Sample rate for the audio stream
+        self.chunk_length_s = chunk_length_s

    def initialize_audio(self):
        """
@@ -93,19 +94,25 @@ def capture_and_transcribe(self, log_file=None):
print(f"Error opening log file: {log_file}", file=sys.stderr, flush=True)
log_file = None

# Continuously read audio data from the microphone
while self.stream.is_active():
try:
audio_data = np.frombuffer(self.stream.read(1024), dtype=np.int16)
self.sliding_window = np.concatenate((self.sliding_window, audio_data))

if len(self.sliding_window) >= self.sample_rate * 30: # 30 seconds
# Perform transcription when the window reaches a certain length
if len(self.sliding_window) >= self.sample_rate * self.chunk_length_s:
transcription = self.transcribe_audio(
self.sliding_window[: self.sample_rate * 30]
)

# Add the transcription to the cache
self.handle_transcription(transcription, log_file)
shift_size = min(
self.sample_rate * 5, len(self.sliding_window) // 2
) # Ensure shift size is not too large

# Shift the sliding window
self.sliding_window = self.sliding_window[
shift_size:
] # Shift by 5 seconds or less
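To make the window arithmetic above concrete: at the default sample rate of 16,000 Hz and chunk length of 30 seconds, transcription triggers at 480,000 buffered samples, and the window then shifts forward by at most 5 seconds (note the slice passed to `transcribe_audio` still hard-codes 30 seconds). A quick worked example:

```python
# Worked numbers for the sliding-window logic above, assuming the
# defaults from __init__ (sample_rate=16000, chunk_length_s=30).
sample_rate = 16_000
chunk_length_s = 30

trigger = sample_rate * chunk_length_s      # 480_000 samples = 30 s of audio
shift = min(sample_rate * 5, trigger // 2)  # 80_000 samples = 5 s
print(trigger, shift)                       # 480000 80000
```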
@@ -134,6 +141,14 @@ def transcribe_audio(self, audio):
            return {}

    def handle_transcription(self, transcription, log_file):
        """
        Handle the transcription by appending the text to the transcription cache and printing it to the standard output.

        :param transcription: A dictionary containing the transcription data.
        :type transcription: dict
        :param log_file: A file to write the transcription text to, if provided.
        :type log_file: str
        """
        if (
            "text" in transcription
            and len(self.transcription_cache) < self.transcription_cache.maxlen
@@ -160,6 +175,16 @@ def is_log_file_writable(self, log_file):
            return False

    def write_to_log(self, log_file, text):
        """
        Write text to a log file.

        Parameters:
            log_file (str): The path to the log file.
            text (str): The text to write to the log file.

        Returns:
            None
        """
        if os.path.getsize(log_file) > 1000000:  # If log file is larger than 1MB
            log_file = create_new_log_file(log_file)
        if not self.is_log_file_writable(log_file):
@@ -176,11 +201,29 @@ def write_to_log(self, log_file, text):
print(f"Error writing to log file: {log_file}", file=sys.stderr, flush=True)

def write_transcription_cache_to_log(self, log_file):
"""
Write the transcription cache to a log file.
Parameters:
log_file (str): The path to the log file.
Returns:
None
"""
if log_file and self.transcription_cache:
transcription = self.transcription_cache.popleft()
self.write_to_log(log_file, transcription)

def close_stream(self, log_file):
"""
Closes the audio stream and stops the recording.
Parameters:
log_file (str): The path to the log file to which the transcription cache will be written.
Returns:
None
"""
if self.stream.is_active():
self.stream.stop_stream()
self.stream.close()
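Stepping back from the diff: the long-form chunking the module's docstring describes can be exercised on its own. A minimal standalone sketch, assuming `transformers` and `torch` are installed and `sample.wav` is a hypothetical 16 kHz audio file:

```python
# Minimal sketch of chunked long-form transcription with Transformers,
# mirroring how RealTimeASR configures its pipeline. "sample.wav" is a
# hypothetical file; any audio path or array works.
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,  # enable chunking for arbitrary-length audio
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    return_timestamps=True,  # also predict sequence-level timestamps
)

result = asr("sample.wav")
print(result["text"])
for chunk in result.get("chunks", []):
    print(chunk["timestamp"], chunk["text"])
```

And a hypothetical end-to-end use of the updated class, mirroring the `run.py` invocation visible in the log's traceback:

```python
# Hypothetical usage; parameter names come from the diff above.
asr_app = RealTimeASR(maxlen=300, chunk_length_s=30)
asr_app.initialize_audio()
asr_app.capture_and_transcribe(log_file="transcription.log")
asr_app.close_stream("transcription.log")
```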
@@ -1,8 +1,48 @@
So, I'm going to go ahead and start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. Thanks for watching!
So, I'm going to go ahead and start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. I'm going to start the video. Thanks for watching!
Hey, I'm just trying to do a test here and see if this application works or not. Takk for ating medietekst.
Hey, I'm just trying to do a test here and see if this application works or not. Takk for ating medietekst.
Hey, I'm just trying to do a test here and see if this application works or not. Can you hear me okay? Vindelsteg
Hey, I'm just trying to do a test here and see if this application works or not. Can you hear me okay? Vindelsteg
this application works or not can you hear me okay? Vindelsteg
this application works or not can you hear me okay? Vindelsteg
Can you hear me, Oga? Vindelbrot
Can you hear me, Oga? Vindelbrot
INFO:root:Transcription stopped by user.
I just want to test this, I don't know how to actually use Pi Audio in order to capture the fucking in order to capture the fucking microphone
I just want to test this, I don't know how to actually use Pi Audio in order to capture the fucking in order to capture the fucking microphone
actually use pi audio in order to capture the fuckin in order to capture the fuckin uh... microphone Wait a minute!
actually use pi audio in order to capture the fuckin in order to capture the fuckin uh... microphone Wait a minute!
try and get this thing loaded. looks like it loaded pretty fast hopefully the change in sliding window and
try and get this thing loaded. looks like it loaded pretty fast hopefully the change in sliding window and
INFO:root:Transcription stopped by user.
INFO:root:Starting audio capture and transcription.
INFO:root:Starting audio capture and transcription.
INFO:root:Starting audio capture and transcription.
INFO:root:Starting audio capture and transcription.
INFO:root:Starting audio capture and transcription.
You Yeah Thanks for watching!
You Yeah Thanks for watching!
INFO:root:Transcription stopped by user.
ok, special took has been added and now we are using the v3 model the worry is that it won't actually transcribe in real time ummm still not seeing anything back just yet would really like to
ok, special took has been added and now we are using the v3 model the worry is that it won't actually transcribe in real time ummm still not seeing anything back just yet would really like to
using the v3 model the worry is that it won't actually transcribe in real time still not seeing anything back just yet would really like to I would really like to.
using the v3 model the worry is that it won't actually transcribe in real time still not seeing anything back just yet would really like to I would really like to.
INFO:root:Transcription stopped by user.
just doing a
just doing a
just doing a so I just gotta keep testing it
just doing a so I just gotta keep testing it
INFO:root:Transcription stopped by user.
ERROR:root:Error during transcription: 'charmap' codec can't encode characters in position 12-23: character maps to <undefined>
Traceback (most recent call last):
File "C:\Users\dae\.vscode\Software\Build-RAGAI\src\transformers\audiotranscription\mictranscription\run.py", line 43, in main
asr_app.capture_and_transcribe(log_file=args.log_file)
File "C:\Users\dae\.vscode\Software\Build-RAGAI\src\transformers\audiotranscription\mictranscription\transcribe_microphone.py", line 125, in capture_and_transcribe
self.close_stream(log_file)
File "C:\Users\dae\.vscode\Software\Build-RAGAI\src\transformers\audiotranscription\mictranscription\transcribe_microphone.py", line 233, in close_stream
self.write_transcription_cache_to_log(log_file)
File "C:\Users\dae\.vscode\Software\Build-RAGAI\src\transformers\audiotranscription\mictranscription\transcribe_microphone.py", line 215, in write_transcription_cache_to_log
self.write_to_log(log_file, transcription)
File "C:\Users\dae\.vscode\Software\Build-RAGAI\src\transformers\audiotranscription\mictranscription\transcribe_microphone.py", line 199, in write_to_log
f.write(text + "\n")
File "C:\Users\dae\AppData\Local\Programs\Python\Python310\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 12-23: character maps to <undefined>
that was a really quick initialization I hope that it runs smoothly because if it doesn't I don't know what to do straight up straight up will not know what to do KONKLOMEN
that was a really quick initialization I hope that it runs smoothly because if it doesn't I don't know what to do straight up straight up will not know what to do KONKLOMEN
INFO:root:Transcription stopped by user.
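One observation on the `UnicodeEncodeError` in the traceback above: on Windows, `open()` defaults to the cp1252 codec, which cannot encode every character Whisper emits. A minimal sketch of the likely fix, assuming the logger opens the file with a plain `open()` call (`log_file` and `text` are hypothetical here):

```python
# Force UTF-8 when appending transcriptions so characters outside cp1252
# (e.g. "…" or non-Latin letters) do not raise UnicodeEncodeError.
log_file = "transcription.log"  # hypothetical path
text = "ummm … still not seeing anything back just yet"  # sample text

with open(log_file, "a", encoding="utf-8") as f:
    f.write(text + "\n")
```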
