bigcode-project · terryyz · May 10, 2024 · May 10, 2024 · May 10, 2024 · May 10, 2024
diff --git a/RECORD.md b/RECORD.md
@@ -31,6 +31,7 @@
  - Wen-Ding Li
  - Ming Xu
  - Zhihan Zhang
+ - Indraneil Paul
 
 ## Round 2: Manual Code Refinement
  - Terry Yue Zhuo
@@ -45,6 +46,7 @@
  - Dmitry Abulkhanov
  - Wen-Ding Li
  - Wenhao Yu
+ - Indraneil Paul
 
 ## Round 3: Data Quality Check
  - Terry Yue Zhuo

diff --git a/data/clean/f_227_indraneil.py b/data/clean/f_227_indraneil.py
@@ -0,0 +1,90 @@
+import random
+import seaborn as sns
+import numpy as np
+from matplotlib import pyplot as plt
+
+def f_227(length, range_limit=100, seed=0):
+    """
+    Draw 'length' seeded random integers between 1 and 'range_limit' (inclusive), sort them in ascending order
+    and show their distribution as a default seaborn histogram on a new figure. Return the axes and the numbers.
+
+    Parameters:
+    length (int): How many random numbers to generate.
+    range_limit (int, Optional): Largest value that can be drawn. Defaults to 100. Must be greater than 1.
+    seed (int, Optional): Seed applied to both the 'random' and 'numpy.random' generators. Defaults to 0.
+
+    Returns:
+    Tuple[matplotlib.axes.Axes, List[int]]: The axes holding the histogram and the sorted random numbers.
+
+    Requirements:
+    - random
+    - matplotlib.pyplot
+    - seaborn
+    - numpy
+
+    Raises:
+    ValueError: If range_limit is less than or equal to 1.
+
+    Example:
+    >>> import matplotlib.pyplot as plt
+    >>> ax, data = f_227(1000, 100, 24) # Generate a list of 1000 random numbers between 1 and 100
+    >>> isinstance(ax, plt.Axes)
+    True
+    """
+    if range_limit <= 1:
+        raise ValueError("range_limit must be greater than 1")
+
+    # Seed both generators so that calls with the same seed are reproducible.
+    for reseed in (random.seed, np.random.seed):
+        reseed(seed)
+
+    draws = (random.randint(1, range_limit) for _ in range(length))
+    sorted_numbers = sorted(draws)
+
+    plt.figure()  # start from a fresh figure
+    histogram = sns.histplot(sorted_numbers, kde=False)
+
+    return histogram.axes, sorted_numbers
+
+
+import unittest
+import doctest
+
+
+class TestCases(unittest.TestCase):
+    """Tests for f_227: output length, input validation, seeded values and histogram bin counts."""
+
+    def test_case_1(self):
+        _, numbers = f_227(1000)
+        self.assertEqual(len(numbers), 1000)
+
+    def test_case_2(self):
+        with self.assertRaises(ValueError):
+            f_227(1000, -3, 42)
+
+    def test_case_3(self):
+        _, numbers = f_227(20, 75, 77)
+        self.assertTrue(all(1 <= value <= 75 for value in numbers))
+        self.assertEqual(numbers, [1, 4, 15, 19, 23, 25, 25, 26, 31, 31, 33, 36, 38, 42, 61, 64, 65, 65, 72, 72])
+
+    def test_case_4(self):
+        expected_counts = np.array([98, 103, 106, 73, 87, 92, 94, 84, 90, 95, 78])
+        axes, _ = f_227(1000, 75)
+        self.assertTrue((axes.containers[0].datavalues == expected_counts).all())
+
+    def test_case_5(self):
+        _, first_run = f_227(1000, seed=42)
+        _, second_run = f_227(1000, seed=42)
+        self.assertEqual(first_run, second_run)
+
+
+def run_tests():
+    """Run every test in TestCases with a text runner."""
+    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13; use the loader API instead.
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
+    unittest.TextTestRunner().run(suite)
+
+
+if __name__ == "__main__":
+    doctest.testmod()  # check the docstring example first
+    run_tests()  # then run the unittest suite
diff --git a/data/clean/f_228_indraneil.py b/data/clean/f_228_indraneil.py
@@ -0,0 +1,90 @@
+import heapq
+import math
+import matplotlib.pyplot as plt
+
+
+def f_228(l1, l2, N=10):
+    """
+    Find the N biggest differences between the respective elements of the list 'l1' and list 'l2',
+    square the differences, take the square root and plot the resulting values, largest first.
+
+    Parameters:
+    l1 (list): A list of numbers.
+    l2 (list): A list of numbers. Paired with 'l1' by position; extra elements of the longer list are ignored.
+    N (int): Number of largest differences to consider. Default is 10.
+
+    Returns:
+    matplotlib.axes.Axes: A matplotlib Axes object with the plotted differences.
+
+    Requirements:
+    - heapq
+    - math
+    - matplotlib.pyplot
+
+    Example:
+    >>> l1 = [99, 86, 90, 70, 86, 95, 56, 98, 80, 81]
+    >>> l2 = [21, 11, 21, 1, 26, 40, 4, 50, 34, 37]
+    >>> ax = f_228(l1, l2)
+    >>> type(ax)
+    <class 'matplotlib.axes._axes.Axes'>
+    """
+    # zip() stops at the shorter list, so a short 'l2' no longer raises IndexError inside the key function.
+    top_pairs = heapq.nlargest(N, zip(l1, l2), key=lambda pair: abs(pair[0] - pair[1]))
+    largest_diffs = [math.sqrt((a - b) ** 2) for a, b in top_pairs]
+
+    _, ax = plt.subplots()
+    ax.plot(largest_diffs)
+    return ax
+
+
+import unittest
+import doctest
+
+
+class TestCases(unittest.TestCase):
+    """Each test plots two lists with f_228 and checks the Axes type and the number of plotted points."""
+
+    def _check_plot(self, ax, expected_points):
+        # Shared assertions: an Axes whose first line holds the expected number of y-values.
+        self.assertIsInstance(ax, plt.Axes)
+        self.assertEqual(len(ax.lines[0].get_ydata()), expected_points)
+
+    def test_case_1(self):
+        first = [99, 86, 90, 70, 86, 95, 56, 98, 80, 81]
+        second = [21, 11, 21, 1, 26, 40, 4, 50, 34, 37]
+        self._check_plot(f_228(first, second), 10)
+
+    def test_case_2(self):
+        first = [10, 20, 30, 40, 50]
+        second = [1, 2, 3, 4, 5]
+        self._check_plot(f_228(first, second, 3), 3)
+
+    def test_case_3(self):
+        # Only six elements are available, fewer than the default N.
+        first = [0, 10, 20, 30, 40, 50]
+        second = [0, 0, 0, 0, 0, 0]
+        self._check_plot(f_228(first, second), 6)
+
+    def test_case_4(self):
+        # Differences of both signs.
+        first = [1, 2, 3, 4, 5]
+        second = [5, 4, 3, 2, 1]
+        self._check_plot(f_228(first, second), 5)
+
+    def test_case_5(self):
+        # Identical lists: every difference is zero.
+        first = [0, 0, 0, 0, 0]
+        second = [0, 0, 0, 0, 0]
+        self._check_plot(f_228(first, second), 5)
+
+
+def run_tests():
+    """Run every test in TestCases with a text runner."""
+    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13; use the loader API instead.
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
+    unittest.TextTestRunner().run(suite)
+
+
+if __name__ == "__main__":
+    doctest.testmod()  # check the docstring example first
+    run_tests()  # then run the unittest suite
diff --git a/data/clean/f_229_indraneil.py b/data/clean/f_229_indraneil.py
@@ -0,0 +1,90 @@
+import re
+import json
+from collections import Counter
+
+
+def f_229(json_str, top_n=10):
+    """
+    Count the URL strings found anywhere in a string-serialized JSON dict (nested dicts and lists included)
+    and return a dict with the URLs as keys and the number of times they appear as values.
+
+    Parameters:
+    json_str (str): The JSON string.
+    top_n (int, Optional): The number of most frequent URLs to return. Defaults to 10.
+
+    Returns:
+    dict: A dict with URLs as keys and the number of times they appear as values.
+
+    Raises:
+    json.JSONDecodeError: If json_str is not valid JSON.
+
+    Requirements:
+    - re
+    - json
+    - collections.Counter
+
+    Example:
+    >>> f_229('{"name": "John", "website": "https://www.example.com"}')
+    {'https://www.example.com': 1}
+    """
+    pattern = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
+    urls = []
+
+    def extract(node):
+        # Walk dict values and list items alike, so URLs stored in JSON arrays are counted too.
+        if isinstance(node, (dict, list)):
+            for child in (node.values() if isinstance(node, dict) else node):
+                extract(child)
+        elif isinstance(node, str) and re.match(pattern, node):  # match() anchors at the start of the value
+            urls.append(node)
+
+    extract(json.loads(json_str))
+    counts = Counter(urls)
+    # Keep first-seen order when nothing is trimmed; otherwise keep only the top_n most frequent URLs.
+    return dict(counts if len(urls) <= top_n else counts.most_common(top_n))
+
+
+import unittest
+import doctest
+
+
+class TestCases(unittest.TestCase):
+    """Tests for f_229: URL matching, nested dicts, top_n trimming and invalid JSON."""
+
+    def test_case_1(self):
+        # The URL does not start the value, so it is not counted.
+        payload = '{"name": "John", "website": "qwerthttps://www.example.com"}'
+        self.assertEqual(f_229(payload), {})
+
+    def test_case_2(self):
+        payload = '{"name": "John", "social": {"twitter": "https://twitter.com/john", "linkedin": "https://linkedin.com/in/john"}, "website": "https://linkedin.com/in/john"}'
+        # Nested dicts are searched; top_n=1 keeps only the most frequent URL.
+        self.assertEqual(f_229(payload), {'https://twitter.com/john': 1, 'https://linkedin.com/in/john': 2})
+        self.assertEqual(f_229(payload, 1), {'https://linkedin.com/in/john': 2})
+
+    def test_case_3(self):
+        # Text that is not JSON lets the decoder error propagate.
+        with self.assertRaises(json.decoder.JSONDecodeError):
+            f_229('This is an adversarial input 0061')
+
+    def test_case_4(self):
+        # No string value looks like a URL.
+        payload = '{"name": "John", "age": 30}'
+        self.assertEqual(f_229(payload), {})
+
+    def test_case_5(self):
+        # A bare domain without a scheme or "www." prefix is ignored.
+        payload = '{"name": "John", "website": "example.com", "blog": "www.johnblog.com"}'
+        self.assertEqual(f_229(payload), {'www.johnblog.com': 1})
+
+
+def run_tests():
+    """Run every test in TestCases with a text runner."""
+    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13; use the loader API instead.
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
+    unittest.TextTestRunner().run(suite)
+
+
+if __name__ == "__main__":
+    doctest.testmod()  # check the docstring example first
+    run_tests()  # then run the unittest suite
diff --git a/data/clean/f_230_indraneil.py b/data/clean/f_230_indraneil.py
@@ -0,0 +1,105 @@
+import numpy as np
+from collections import Counter
+import matplotlib.pyplot as plt
+
+def f_230(L):
+    """
+    Analyze an "L" list by calculating the mean, median, mode, and standard deviation.
+    Visualize the data with a histogram drawn on a new figure, so repeated calls never share axes.
+
+    Parameters:
+    L (list): Input list. Must not be empty.
+
+    Returns:
+    dict: A dictionary with the keys 'mean', 'median', 'mode' (ties go to the value seen first), 'std_dev'
+    (population standard deviation) and 'plot' (the matplotlib Axes holding the histogram).
+
+    Requirements:
+    - numpy
+    - collections.Counter
+    - matplotlib.pyplot
+
+    Example:
+    >>> L = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> stats = f_230(L)
+    >>> print(stats["mean"])
+    5.0
+    >>> print(stats["median"])
+    5.0
+    >>> print(stats["mode"])
+    1
+    """
+    mean = np.mean(L)
+    median = np.median(L)
+    mode = Counter(L).most_common(1)[0][0]
+    std_dev = np.std(L)
+
+    _, ax = plt.subplots()  # plt.hist() would draw on whatever axes is current and stack up across calls
+    ax.hist(L, bins='auto')
+    ax.set(title='Histogram of Data', xlabel='Value', ylabel='Frequency')
+
+    return {'mean': mean, 'median': median, 'mode': mode, 'std_dev': std_dev, 'plot': ax}
+
+
+import unittest
+import doctest
+
+
+class TestCases(unittest.TestCase):
+    """Tests for f_230: each case checks the four statistics and that a matplotlib Axes is returned."""
+
+    def test_case_1(self):
+        values = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+        result = f_230(values)
+        expected = {'mean': np.mean(values), 'median': np.median(values), 'std_dev': np.std(values)}
+        for key, want in expected.items():
+            self.assertAlmostEqual(result[key], want)
+        self.assertEqual(result['mode'], 1)
+        self.assertIsInstance(result['plot'], plt.Axes)
+
+    def test_case_2(self):
+        result = f_230([5, 5, 5, 5, 5])
+        for key in ('mean', 'median'):
+            self.assertAlmostEqual(result[key], 5.0)
+        self.assertAlmostEqual(result['std_dev'], 0.0)
+        self.assertEqual(result['mode'], 5)
+        self.assertIsInstance(result['plot'], plt.Axes)
+
+    def test_case_3(self):
+        # 8 appears three times, more often than any other value.
+        values = [1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 8, 9]
+        result = f_230(values)
+        expected = {'mean': np.mean(values), 'median': np.median(values), 'std_dev': np.std(values)}
+        for key, want in expected.items():
+            self.assertAlmostEqual(result[key], want)
+        self.assertEqual(result['mode'], 8)
+        self.assertIsInstance(result['plot'], plt.Axes)
+
+    def test_case_4(self):
+        values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+        result = f_230(values)
+        expected = {'mean': np.mean(values), 'median': np.median(values), 'std_dev': np.std(values)}
+        for key, want in expected.items():
+            self.assertAlmostEqual(result[key], want)
+        self.assertEqual(result['mode'], 10)
+        self.assertIsInstance(result['plot'], plt.Axes)
+
+    def test_case_5(self):
+        result = f_230([5])
+        for key in ('mean', 'median'):
+            self.assertAlmostEqual(result[key], 5.0)
+        self.assertAlmostEqual(result['std_dev'], 0.0)
+        self.assertEqual(result['mode'], 5)
+        self.assertIsInstance(result['plot'], plt.Axes)
+
+
+def run_tests():
+    """Run every test in TestCases with a text runner."""
+    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13; use the loader API instead.
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
+    unittest.TextTestRunner().run(suite)
+
+
+if __name__ == "__main__":
+    doctest.testmod()  # check the docstring example first
+    run_tests()  # then run the unittest suite