Commit: lint

cavit99 committed Feb 3, 2025
1 parent 9b1068e commit aff0fec
Showing 13 changed files with 304 additions and 316 deletions.
43 changes: 21 additions & 22 deletions examples/word_ladder/README.md
@@ -6,21 +6,21 @@ This project generates a dataset of word ladder puzzles and (optionally) submits

The project consists of several key components:

- **`main.py`**:
Orchestrates the overall flow. It performs the following tasks:
1. Generates a dataset of word ladder puzzles by calling functions from `utils/create_word_ladders.py`.
2. (Optionally) Triggers the reasoning request process to augment puzzles with chain-of-thought reasoning via `utils/generate_reasoning.py`.
3. (Planned) Additional steps such as checking results or uploading the final dataset.

The configuration for the dataset parameters (e.g., word length, chain length, and dataset size) is centralized here, making it easy to adjust the settings as needed.

- **`utils/create_word_ladders.py`**:
Contains functions to create and validate a word ladder dataset. It leverages underlying modules (e.g., `reasoning_gym`) to generate individual puzzles and ensures uniqueness across the dataset.
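  A minimal sketch of what this generation looks like, assuming `reasoning_gym` exposes its `create_dataset` entry point and that the keyword arguments mirror the config keys described below (treat the exact parameters as illustrative):

  ```python
  import reasoning_gym

  # Illustrative parameters; names mirror the config keys used in main.py
  data = reasoning_gym.create_dataset(
      "word_ladder",
      min_word_length=3,
      max_word_length=3,
      min_chain_length=-1,  # -1 = shortest possible chain
      max_chain_length=7,
      size=5,
      seed=42,
  )
  for i, item in enumerate(data):
      print(f'{i}: q="{item["question"]}" a="{item["answer"]}"')
  ```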

- **`utils/generate_reasoning.py`**:
Reads the generated dataset (in JSONL format), then filters out puzzles that already have reasoning. For puzzles missing chain-of-thought data, it splits them into batches (with a default batch size that you can adjust) and submits each batch to Anthropic's Message Batches API. Each API request includes the puzzle along with a custom system prompt (read from `system_prompt.txt`), and the resulting metadata is stored for later retrieval and analysis.
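  A minimal sketch of a single-request batch submission via the `anthropic` SDK; the model name and `custom_id` below are placeholders, and the script's real request construction differs:

  ```python
  import anthropic

  client = anthropic.Anthropic()  # expects ANTHROPIC_API_KEY in the environment

  system_prompt = open("system_prompt.txt").read()
  puzzle = "Transform CAT into DOG, changing one letter at a time."  # placeholder puzzle

  batch = client.messages.batches.create(
      requests=[
          {
              "custom_id": "puzzle-0",  # placeholder id; the script derives its own
              "params": {
                  "model": "claude-3-5-sonnet-20241022",  # assumed model
                  "max_tokens": 1024,
                  "system": system_prompt,
                  "messages": [{"role": "user", "content": puzzle}],
              },
          },
      ],
  )
  print(f"Submitted batch {batch.id}")  # poll later via client.messages.batches.retrieve(batch.id)
  ```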

- **`usage_stats.py`**:
Analyzes API response files to compute detailed usage statistics. This script:
- Extracts token usage metrics such as `input_tokens`, `cache_creation_input_tokens`, `cache_read_input_tokens`, and `output_tokens`.
- Calculates costs based on pricing data and shows the savings achieved through prompt caching (a worked sketch of the arithmetic follows this list).
@@ -29,23 +29,23 @@
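The cost arithmetic itself is simple. A worked sketch with illustrative per-million-token prices (the script reads real pricing data; these constants are assumptions):

```python
# Illustrative USD prices per million tokens (assumed, not the script's table)
PRICES = {
    "input_tokens": 3.00,
    "cache_creation_input_tokens": 3.75,  # cache writes are priced above raw input
    "cache_read_input_tokens": 0.30,      # cache reads cost a small fraction of input
    "output_tokens": 15.00,
}

def cost_usd(usage: dict) -> float:
    """Cost of one API response, given its token usage block."""
    return sum(usage.get(key, 0) / 1_000_000 * price for key, price in PRICES.items())

usage = {"input_tokens": 1200, "cache_read_input_tokens": 9000, "output_tokens": 800}
uncached = dict(usage, input_tokens=usage["input_tokens"] + usage["cache_read_input_tokens"],
                cache_read_input_tokens=0)
print(f"with caching: ${cost_usd(usage):.4f}  without: ${cost_usd(uncached):.4f}")
```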

## Warning

**Caution:**
Running large batches of requests via the Anthropic API (especially in `generate_reasoning.py`) can incur significant costs in Anthropic credits. **Please review and understand your API quota and budget before running API calls.** If you are just testing or working with a demo dataset, adjust the batch size or dataset size appropriately to avoid unexpected charges.

## Prerequisites

- **Python Version:** Python 3.7+
- **Dependencies:**
- `tqdm`
- `anthropic`
- `reasoning_gym`
- **Environment Variables:**
For generating reasoning batches, set your Anthropic API key:
```bash
export ANTHROPIC_API_KEY=your_api_key_here
```

## Directory Structure

```
examples/word_ladder/
...
```

@@ -62,15 +62,15 @@

The dataset generation parameters are centralized in `main.py` under the `config` dictionary. You can adjust settings like:

- **Word Length:**
  - `min_word_length`
  - `max_word_length`

- **Chain Length:**
  - `min_chain_length` (e.g., set to -1 for the shortest possible chain)
  - `max_chain_length`

- **Dataset Size:**
  - `size` — the number of puzzles to generate (e.g., `1000` for a demo)
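For instance, a demo-sized run might look like the following sketch; `size` is kept deliberately small here to limit any later API spend:

```python
config = {
    "dataset_name": "word_ladder",
    "dataset_config": {
        "min_word_length": 3,
        "max_word_length": 3,
        "min_chain_length": -1,  # shortest possible chain
        "max_chain_length": 7,
        "size": 100,  # keep small for a demo
    },
}
```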

## How to Run
@@ -114,19 +114,18 @@

## Troubleshooting

- **File Paths:**
  Verify that `system_prompt.txt` is in the `/examples/word_ladder` folder as expected. The modules use paths relative to their location.

- **Environment Variables:**
  Make sure your `ANTHROPIC_API_KEY` is set correctly when submitting API requests.

- **Output Directory Permissions:**
  Ensure the `output` directory exists and is writable by your user.

- **Cost Monitoring:**
  Check your Anthropic API usage and account balance before running large batches to avoid unexpected costs.

## License

This project is licensed under the MIT License.

36 changes: 18 additions & 18 deletions examples/word_ladder/main.py
@@ -6,26 +6,26 @@
3. Upload the final dataset to HuggingFace Hub (if needed)
"""

-import uuid
import sys
+import uuid
from pathlib import Path
-from typing import Dict, Any
+from typing import Any, Dict

from examples.word_ladder.utils import create_word_ladders, generate_reasoning


def create_dataset(jsonl_path: Path, config: Dict[str, Any]) -> bool:
    """
    Creates the word ladder dataset, handling potential exhaustion gracefully.

    Returns:
        bool: True if dataset was created (even if truncated), False if creation failed
    """
    try:
        print("Step 1: Algorithmically creating word ladder chains...")
        create_word_ladders.create_word_ladder_dataset(str(jsonl_path), config=config)
        return True

    except IndexError as e:
        # Dataset was exhausted but some examples were generated
        print("\nNote: Dataset generation stopped early due to exhaustion of unique puzzles.")
@@ -34,23 +34,24 @@ def create_dataset(jsonl_path: Path, config: Dict[str, Any]) -> bool:
print("Continuing with the partial dataset that was successfully generated.")
return True
return False

    except Exception as e:
        # Unexpected error during dataset creation
        print(f"\nError: Failed to create dataset: {str(e)}")
        return False


def main():
    # Centralized configuration for the dataset
    config = {
-        'dataset_name': 'word_ladder',
-        'dataset_config': {
-            'min_word_length': 3,
-            'max_word_length': 5,
-            'min_chain_length':-1, # set to -1 for the shortest possible path
-            'max_chain_length':10,
-            'size': 100, # Generate a small-ish dataset for demonstration
-        }
+        "dataset_name": "word_ladder",
+        "dataset_config": {
+            "min_word_length": 3,
+            "max_word_length": 3,
+            "min_chain_length": -1,  # set to -1 for the shortest possible path
+            "max_chain_length": 7,
+            "size": 2000,  # Generate a small-ish dataset for demonstration
+        },
    }

    # Generate a friendly unique identifier and compose the file path
@@ -64,21 +64,20 @@ def main():
print("Exiting due to dataset creation failure.")
sys.exit(1)

    # Step 2: Generate reasoning
    '''
    try:
        print("\nStep 2: Submitting reasoning batches for the dataset...")
        generate_reasoning.submit_reasoning_batches(input_path=str(jsonl_path))
    except Exception as e:
        print(f"\nError: Failed to submit reasoning batches: {str(e)}")
        sys.exit(1)
    '''

    # Step 3: Check Anthropic batch results
    # Step 4: Upload to HuggingFace 🤗

    print("\nComplete!")


if __name__ == "__main__":
    main()