From 4921c8941ce0d187e41f246a0cb8e8b8d45e0ae1 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 15 Apr 2024 06:27:23 +1000 Subject: [PATCH] update dwenhao data --- data/clean/f_746_wenhao.py | 4 +- data/clean/f_752_wenhao.py | 2 +- data/clean/f_761_wenhao.py | 3 +- data/clean/f_762_wenhao.py | 4 +- data/clean/f_765_wenhao.py | 5 +- data/clean/f_770_wenhao.py | 4 +- data/open-eval.jsonl | 626 +++++++++++++------------- data/processed/f_331_jenny_w_doc.py | 4 +- data/processed/f_336_jenny_w_doc.py | 2 +- data/processed/f_367_jenny_wo_doc.py | 2 +- data/processed/f_407_jenny_wo_doc.py | 2 +- data/processed/f_411_jenny_w_doc.py | 2 +- data/processed/f_413_jenny_wo_doc.py | 2 +- data/processed/f_423_jenny_wo_doc.py | 2 +- data/processed/f_746_wenhao_w_doc.py | 4 +- data/processed/f_750_wenhao_wo_doc.py | 2 +- data/processed/f_752_wenhao_w_doc.py | 2 +- data/processed/f_757_wenhao_w_doc.py | 2 +- data/processed/f_758_wenhao_w_doc.py | 2 +- data/processed/f_761_wenhao_w_doc.py | 3 +- data/processed/f_762_wenhao_w_doc.py | 4 +- data/processed/f_765_wenhao_w_doc.py | 5 +- data/processed/f_770_wenhao_w_doc.py | 4 +- data/processed/f_778_wenhao_w_doc.py | 2 +- data/processed/f_798_wenhao_wo_doc.py | 2 +- data/processed/f_810_wenhao_w_doc.py | 2 +- data/processed/f_811_wenhao_w_doc.py | 12 +- data/processed/f_820_wenhao_w_doc.py | 2 +- data/processed/f_827_wenhao_w_doc.py | 4 +- data/processed/f_830_wenhao_w_doc.py | 2 +- data/processed/f_836_chien_w_doc.py | 2 +- data/processed/f_857_chien_wo_doc.py | 2 +- data/processed/f_867_chien_w_doc.py | 2 +- data/processed/f_875_chien_w_doc.py | 2 +- data/processed/f_887_chien_w_doc.py | 2 +- data/processed/f_891_chien_wo_doc.py | 2 +- data/processed/f_895_chien_w_doc.py | 2 +- data/processed/f_898_chien_wo_doc.py | 4 +- data/processed/f_902_chien_w_doc.py | 2 +- data/processed/f_906_chien_w_doc.py | 12 +- data/processed/f_910_chien_w_doc.py | 2 +- data/processed/f_915_chien_w_doc.py | 2 +- data/processed/f_917_chien_wo_doc.py | 2 
+- data/processed/f_923_chien_w_doc.py | 2 +- data/processed/f_925_chien_w_doc.py | 2 +- data/raw/f_746_wenhao.py | 4 +- data/raw/f_752_wenhao.py | 2 +- data/raw/f_761_wenhao.py | 3 +- data/raw/f_762_wenhao.py | 4 +- data/raw/f_765_wenhao.py | 5 +- data/raw/f_770_wenhao.py | 4 +- script/parse.py | 2 +- 52 files changed, 392 insertions(+), 392 deletions(-) diff --git a/data/clean/f_746_wenhao.py b/data/clean/f_746_wenhao.py index 6032121c..f4556362 100644 --- a/data/clean/f_746_wenhao.py +++ b/data/clean/f_746_wenhao.py @@ -20,11 +20,11 @@ def f_746(d, keys=['x', 'y', 'z']): >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}] >>> ax = f_746(data) >>> type(ax) - + >>> ax = f_746(data, keys=['x', 'y']) >>> type(ax) - + """ # Convert the list of dictionaries to a DataFrame df = pd.DataFrame(d) diff --git a/data/clean/f_752_wenhao.py b/data/clean/f_752_wenhao.py index bf5f2a4b..4fee97f4 100644 --- a/data/clean/f_752_wenhao.py +++ b/data/clean/f_752_wenhao.py @@ -31,7 +31,7 @@ def f_752(letters, repetitions, colors): Example: >>> ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue']) >>> type(ax) - + """ if len(letters) != len(repetitions) or len(letters) != len(colors) or len(letters) == 0: raise ValueError("All lists must be the same length and non-empty.") diff --git a/data/clean/f_761_wenhao.py b/data/clean/f_761_wenhao.py index dfde8305..7cb4838a 100644 --- a/data/clean/f_761_wenhao.py +++ b/data/clean/f_761_wenhao.py @@ -16,7 +16,7 @@ def f_761(df, column): - column (str): The name of the column in the DataFrame that contains the categories. Output: - - matplotlib.axes._subplots.AxesSubplot: The Axes object for the generated plot. + - matplotlib.axes._subplots.Axes: The Axes object for the generated plot. 
Requirements: - pandas @@ -30,7 +30,6 @@ def f_761(df, column): >>> df = pd.DataFrame({'Type': ['A', 'A', 'C', 'E', 'D', 'E', 'D']}) >>> ax = f_761(df, 'Type') - # This generates and displays a bar chart showing the distribution of each category within the 'Type' column, including categories with zero occurrences. """ # Define the categories CATEGORIES = ['A', 'B', 'C', 'D', 'E'] diff --git a/data/clean/f_762_wenhao.py b/data/clean/f_762_wenhao.py index 16a1a8e8..61d6fd36 100644 --- a/data/clean/f_762_wenhao.py +++ b/data/clean/f_762_wenhao.py @@ -11,7 +11,7 @@ def f_762(df): df (pandas.DataFrame): The DataFrame containing numerical columns to be used for correlation. Returns: - matplotlib.axes._subplots.AxesSubplot: The matplotlib Axes object representing the heatmap. + matplotlib.axes._subplots.Axes: The matplotlib Axes object representing the heatmap. Requirements: - pandas @@ -22,7 +22,7 @@ def f_762(df): >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) >>> ax = f_762(df) >>> type(ax) - + """ correlation_matrix = df.corr() diff --git a/data/clean/f_765_wenhao.py b/data/clean/f_765_wenhao.py index 22757d9f..a0345ab3 100644 --- a/data/clean/f_765_wenhao.py +++ b/data/clean/f_765_wenhao.py @@ -24,10 +24,11 @@ def f_765(person_names, email_domains, num_records=5): - ValueError: If the number of names provided is less than the number of records requested or if no email domains are provided. 
Example: + >>> random.seed(0) # Initialize random seed >>> f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2) Name Email - 0 John Doe john[at]yahoo.com - 1 Jane Smith jane[at]gmail.com + 0 Jane Smith jane[at]gmail.com + 1 John Doe john[at]yahoo.com >>> f_765(['Alice'], ['outlook.com'], 1) Name Email 0 Alice alice[at]outlook.com diff --git a/data/clean/f_770_wenhao.py b/data/clean/f_770_wenhao.py index 17a0b58b..79426deb 100644 --- a/data/clean/f_770_wenhao.py +++ b/data/clean/f_770_wenhao.py @@ -26,8 +26,8 @@ def f_770(word: str) -> dict: - The function uses the `string` library to get a string of lowercase alphabets. Example: - >>> f_770('abcdef') - {'ab': 1, 'ac': 0, 'ad': 0, ..., 'yx': 0, 'yz': 0, 'za': 0, ..., 'zx': 0, 'zy': 0} + >>> list(f_770('abcdef').items())[:5] + [('ab', 1), ('ac', 0), ('ad', 0), ('ae', 0), ('af', 0)] """ ALPHABETS = string.ascii_lowercase # Generate all two-letter combinations of alphabets diff --git a/data/open-eval.jsonl b/data/open-eval.jsonl index 1b19fc08..aa49b9c2 100644 --- a/data/open-eval.jsonl +++ b/data/open-eval.jsonl @@ -1,347 +1,347 @@ -{"task_id": "f_885", "prompt": "import re\nimport os\n\n\ndef f_885(request):\n \"\"\"\n Handles an HTTP GET request to retrieve a static file from the server.\n\n This function processes an HTTP GET request, extracts the filename from it, checks the existence of the file\n in the server's directory, and returns an HTTP response. The response either contains the file content (if found) or an\n appropriate error message (if not found or if the request is invalid).\n\n Parameters:\n - request (str): An HTTP GET request in string format. 
The expected format is \"GET / HTTP/1.1\".\n\n Returns:\n - str: An HTTP response string, which includes the status code, content length (for 200 OK responses), and the file content\n or an error message.\n\n Requirements:\n - os\n - re\n\n Examples:\n >>> f_885(\"GET /test.txt HTTP/1.1\")\n \"HTTP/1.1 200 OK\\r\\nContent-Length: \\r\\n\\r\\n\"\n >>> f_885(\"GET /nonexistent.txt HTTP/1.1\")\n \"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"\n >>> f_885(\"INVALID REQUEST\")\n \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n >>> f_885(\"GET /restricted.txt HTTP/1.1\") # Assuming an I/O error occurs\n \"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\"\n \"\"\"", "canonical_solution": " match = re.match(r\"^GET /([\\w\\.\\-]+) HTTP/1\\.1$\", request)\n if match:\n file_name = match.group(1)\n if os.path.exists(file_name):\n try:\n with open(file_name, \"rb\") as file:\n content = file.read()\n response = f\"HTTP/1.1 200 OK\\r\\nContent-Length: {len(content)}\\r\\n\\r\\n{content.decode('utf-8')}\"\n except Exception:\n response = (\n \"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\"\n )\n else:\n response = \"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"\n else:\n response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n\n return response", "test": "import unittest\nimport re\nimport os\nfrom unittest.mock import mock_open, patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_885 function.\"\"\"\n def setUp(self):\n \"\"\"Set up the environment for testing by creating test files.\"\"\"\n with open(\"test.txt\", \"w\", encoding=\"utf-8\") as f:\n f.write(\"This is a test file.\")\n def tearDown(self):\n \"\"\"Clean up the environment by deleting the test files created.\"\"\"\n os.remove(\"test.txt\")\n def test_file_found(self):\n \"\"\"Test the response when the requested file is found.\"\"\"\n request = \"GET /test.txt HTTP/1.1\"\n expected_response = (\n \"HTTP/1.1 200 
OK\\r\\nContent-Length: 20\\r\\n\\r\\nThis is a test file.\"\n )\n self.assertEqual(f_885(request), expected_response)\n def test_file_not_found(self):\n \"\"\"Test the response when the requested file is not found.\"\"\"\n request = \"GET /nonexistent.txt HTTP/1.1\"\n expected_response = \"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"\n self.assertEqual(f_885(request), expected_response)\n def test_bad_request(self):\n \"\"\"Test the response for a badly formatted request.\"\"\"\n request = \"BAD REQUEST\"\n expected_response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n self.assertEqual(f_885(request), expected_response)\n def test_empty_request(self):\n \"\"\"Test the response for an empty request.\"\"\"\n request = \"\"\n expected_response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n self.assertEqual(f_885(request), expected_response)\n def test_invalid_method_request(self):\n \"\"\"Test the response for a request with an invalid HTTP method.\"\"\"\n request = \"POST /test.txt HTTP/1.1\"\n expected_response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n self.assertEqual(f_885(request), expected_response)\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"data\")\n def test_internal_server_error(self, mock_file):\n \"\"\"Test the response when there's an internal server error (e.g., file read error).\"\"\"\n mock_file.side_effect = Exception(\"Mocked exception\")\n request = \"GET /test.txt HTTP/1.1\"\n expected_response = (\n \"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\"\n )\n self.assertEqual(f_885(request), expected_response)", "apis": ["os.path.exists", "os.path", "re.match"], "libs": ["re", "os"], "doc": {"description": ["Handles an HTTP GET request to retrieve a static file from the server.", "This function processes an HTTP GET request, extracts the filename from it, checks the existence of the file", "in the server's directory, and returns an HTTP response. 
The response either contains the file content (if found) or an", "appropriate error message (if not found or if the request is invalid)."], "note": [], "params": ["request (str): An HTTP GET request in string format. The expected format is \"GET / HTTP/1.1\"."], "returns": ["str: An HTTP response string, which includes the status code, content length (for 200 OK responses), and the file content", "or an error message."], "reqs": ["os", "re"], "raises": [], "example": ["Examples:", ">>> f_885(\"GET /test.txt HTTP/1.1\")", "\"HTTP/1.1 200 OK\\r\\nContent-Length: \\r\\n\\r\\n\"", ">>> f_885(\"GET /nonexistent.txt HTTP/1.1\")", "\"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"", ">>> f_885(\"INVALID REQUEST\")", "\"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"", ">>> f_885(\"GET /restricted.txt HTTP/1.1\") # Assuming an I/O error occurs", "\"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\""]}} -{"task_id": "f_388", "prompt": "import random\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\n\ndef f_388(epoch_milliseconds, seed=None):\n \"\"\"\n Generate and draw a sales trend for different categories from a particular epoch milliseconds\n to the current time.\n\n The function selects category from ['Electronics', 'Clothing', 'Home', 'Books', 'Sports'].\n Each day's sales are randomly determined between 10 and 50 units for each category.\n The plot's x-axis represents 'Days since (the start date)', and the y-axis represents 'Sales' units.\n\n Parameters:\n - epoch_milliseconds (int): Start time. Must be positive and before current time.\n - seed (int, optional): Seed for random number generation. 
Default is None (no seed).\n\n Returns:\n - sales_data (dict): Sales data for different categories over days.\n - ax (plt.Axes): The plot depicting the sales trend.\n\n Requirements:\n - random\n - datetime.datetime\n - matplotlib\n\n Example:\n >>> random.seed(42)\n >>> sales_data, ax = f_388(1236472051807, seed=42)\n >>> type(sales_data)\n \n >>> list(sales_data['Electronics'])[:3]\n [50, 24, 47]\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " CATEGORIES = [\"Electronics\", \"Clothing\", \"Home\", \"Books\", \"Sports\"]\n\n if seed is not None:\n random.seed(seed)\n\n if epoch_milliseconds < 0:\n raise ValueError(\"Start time cannot be negative.\")\n\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n current_time = datetime.now()\n days_diff = (current_time - start_time).days\n if days_diff <= 0:\n raise ValueError(\"Start date must be before current time.\")\n\n sales_data = {category: [0] * days_diff for category in CATEGORIES}\n\n for i in range(days_diff):\n for category in CATEGORIES:\n sales = random.randint(10, 50)\n sales_data[category][i] += sales\n\n fig, ax = plt.subplots()\n for category, sales in sales_data.items():\n ax.plot(range(days_diff), sales, label=category)\n\n ax.set_xlabel(\"Days since \" + start_time.strftime(\"%Y-%m-%d %H:%M:%S\"))\n ax.set_ylabel(\"Sales\")\n ax.legend()\n\n return sales_data, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nfrom datetime import datetime\nfrom datetime import timedelta\nclass TestCases(unittest.TestCase):\n def _check_sales_data(self, sales_data, expected_days):\n \"\"\"Utility function to validate sales data.\"\"\"\n self.assertIsInstance(sales_data, dict)\n self.assertEqual(\n set(sales_data.keys()),\n set([\"Electronics\", \"Clothing\", \"Home\", \"Books\", \"Sports\"]),\n )\n for category, sales in sales_data.items():\n self.assertEqual(len(sales), expected_days)\n for sale in sales:\n self.assertGreaterEqual(sale, 10)\n self.assertLessEqual(sale, 50)\n def 
test_case_1(self):\n # Basic test on manual example - Jan 1 2021\n sales_data, ax = f_388(1609459200000, seed=1)\n self.assertIsInstance(sales_data, dict)\n self.assertIsInstance(ax, plt.Axes)\n self._check_sales_data(\n sales_data,\n (datetime.now() - datetime.fromtimestamp(1609459200000 / 1000.0)).days,\n )\n self.assertEqual(ax.get_ylabel(), \"Sales\")\n def test_case_2(self):\n # Basic test on current date - should raise error\n current_epoch = int(datetime.now().timestamp() * 1000)\n with self.assertRaises(ValueError):\n f_388(current_epoch, seed=2)\n def test_case_3(self):\n # Test random seed\n t = 1609459200000\n sales_data1, _ = f_388(t, seed=42)\n sales_data2, _ = f_388(t, seed=42)\n sales_data3, _ = f_388(t, seed=3)\n self.assertEqual(sales_data1, sales_data2)\n self.assertNotEqual(sales_data1, sales_data3)\n def test_case_4(self):\n # Test that future date raises ValueError\n future_epoch = int((datetime.now() + timedelta(days=1)).timestamp() * 1000)\n with self.assertRaises(ValueError):\n f_388(future_epoch, seed=4)\n def test_case_5(self):\n # Test that negative epoch milliseconds raise an error\n with self.assertRaises(ValueError):\n f_388(-1609459200000, seed=5)\n def test_case_6(self):\n # Test that non-integer types for epoch milliseconds raise a TypeError\n with self.assertRaises(TypeError):\n f_388(\"1609459200000\", seed=6)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["random.randint", "random.seed", "datetime.datetime.now", "datetime.datetime.fromtimestamp", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "random", "datetime"], "doc": {"description": ["Generate and draw a sales trend for different categories from a particular epoch milliseconds", "to the current time.", "The function selects category from ['Electronics', 'Clothing', 'Home', 'Books', 'Sports'].", "Each day's sales are randomly determined between 10 and 50 units for each category.", "The plot's x-axis represents 'Days since (the start date)', and the y-axis 
represents 'Sales' units."], "note": [], "params": ["epoch_milliseconds (int): Start time. Must be positive and before current time.", "seed (int, optional): Seed for random number generation. Default is None (no seed)."], "returns": ["sales_data (dict): Sales data for different categories over days.", "ax (plt.Axes): The plot depicting the sales trend."], "reqs": ["random", "datetime.datetime", "matplotlib"], "raises": [], "example": [">>> random.seed(42)", ">>> sales_data, ax = f_388(1236472051807, seed=42)", ">>> type(sales_data)", "", ">>> list(sales_data['Electronics'])[:3]", "[50, 24, 47]", ">>> type(ax)", ""]}} -{"task_id": "f_535", "prompt": "import pandas as pd\nimport os\n\ndef f_535(filename):\n \"\"\"\n Read a CSV file of pandas, reverse the order of the lines and write the inverted lines back into the file. Then move the cursor back to the beginning of the file. \n The header should not be inverted and the file may be empty.\n\n Parameters:\n - filename (str): The name of the CSV file.\n\n Returns:\n - filename (str): The name of the CSV file.\n\n Requirements:\n - os\n - pandas\n\n Example:\n >>> f_535('file.csv')\n 'file.csv'\n \"\"\"", "canonical_solution": " if not os.path.exists(filename):\n return filename\n\n # Check if empty\n with open(filename, 'r') as file:\n if not file.read(1):\n return filename\n\n df = pd.read_csv(filename)\n df = df.iloc[::-1]\n df.to_csv(filename, index=False)\n\n with open(filename, 'r+') as file:\n file.seek(0)\n\n return filename", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def base(self, filename, contents, expected):\n # Create file\n with open(filename, 'w') as f:\n f.write(contents)\n # Run function\n f_535(filename)\n # Check file\n with open(filename, 'r') as f:\n self.assertEqual(f.read().strip(), expected.strip())\n # Remove file\n os.remove(filename)\n def test_case_1(self):\n self.base('file.csv', 'a,b,c\\n1,2,3\\n4,5,6\\n7,8,9', 'a,b,c\\n7,8,9\\n4,5,6\\n1,2,3')\n def 
test_case_2(self):\n self.base('file.csv', 'a,b,c\\n1,2,3\\n4,5,6', 'a,b,c\\n4,5,6\\n1,2,3')\n def test_case_3(self):\n self.base('file.csv', 'a,b,c\\n1,2,3', 'a,b,c\\n1,2,3')\n def test_case_4(self):\n self.base('file.csv', 'a,b,c', 'a,b,c')\n def test_case_5(self):\n self.base('file.csv', '', '')", "apis": ["os.path.exists", "pandas.read_csv", "os.path"], "libs": ["pandas", "os"], "doc": {"description": ["Read a CSV file of pandas, reverse the order of the lines and write the inverted lines back into the file. Then move the cursor back to the beginning of the file.", "The header should not be inverted and the file may be empty."], "note": [], "params": ["filename (str): The name of the CSV file."], "returns": ["filename (str): The name of the CSV file."], "reqs": ["os", "pandas"], "raises": [], "example": [">>> f_535('file.csv')", "'file.csv'"]}} -{"task_id": "f_609", "prompt": "import base64\nimport re\nfrom html import unescape\nimport textwrap\n\ndef f_609(raw_string, line_length):\n \"\"\"\n Decode a raw string from base64, decouple HTML entities, replace multiple spaces with a single space, strip leading and subsequent spaces, and wrap text to a certain line length.\n\n Parameters:\n - raw_string (str): The base64 encoded string.\n - line_length (int): The maximum length of a line.\n\n Returns:\n - wrapped_text (str): The cleaned and formatted string.\n\n Requirements:\n - base64\n - re\n - html\n - textwrap\n\n Example:\n >>> f_609('SGVsbG8sICBXb3JsZCEgICAg', 5)\n 'Hello\\\\n, Wor\\\\nld!'\n \"\"\"", "canonical_solution": "\n # Decode the string from base64\n decoded_string = base64.b64decode(raw_string).decode('utf-8')\n\n # Unescape HTML entities\n unescaped_string = unescape(decoded_string)\n\n # Replace multiple spaces with a single space and strip leading and trailing spaces\n cleaned_string = re.sub(' +', ' ', unescaped_string).strip()\n\n # Wrap the text\n wrapped_text = textwrap.fill(cleaned_string, line_length)\n\n return wrapped_text", "test": 
"import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 5), 'Hello\\n, Wor\\nld!')\n def test_case_2(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 10), 'Hello,\\nWorld!')\n def test_case_3(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 20), 'Hello, World!')\n def test_case_4(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 1), 'H\\ne\\nl\\nl\\no\\n,\\nW\\no\\nr\\nl\\nd\\n!')\n def test_case_5(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 2), 'He\\nll\\no,\\nWo\\nrl\\nd!')", "apis": ["base64.b64decode", "re.sub", "html.unescape", "textwrap.fill"], "libs": ["base64", "html", "re", "textwrap"], "doc": {"description": ["Decode a raw string from base64, decouple HTML entities, replace multiple spaces with a single space, strip leading and subsequent spaces, and wrap text to a certain line length."], "note": [], "params": ["raw_string (str): The base64 encoded string.", "line_length (int): The maximum length of a line."], "returns": ["wrapped_text (str): The cleaned and formatted string."], "reqs": ["base64", "re", "html", "textwrap"], "raises": [], "example": [">>> f_609('SGVsbG8sICBXb3JsZCEgICAg', 5)", "'Hello\\\\n, Wor\\\\nld!'"]}} -{"task_id": "f_820", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndef f_820(array, features=None, seed=None):\n \"\"\"\n Shuffles the columns of a given 2D numpy array and visualizes it as a heatmap.\n\n Parameters:\n - array (ndarray): The 2D numpy array to shuffle and plot. 
It must not be empty.\n - features (list of str, optional): Custom labels for the columns after shuffling.\n If not specified, default numerical labels are used.\n The list must match the number of columns in 'array'.\n - seed (int, optional): Seed for the random number generator to ensure reproducibility of the shuffle.\n\n Returns:\n - Axes: The matplotlib Axes object containing the heatmap.\n\n Raises:\n - ValueError: If 'features' is provided and does not match the number of columns in 'array'; and\n if 'array' is empty or not 2-dimensional.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - seaborn\n\n Notes:\n - This function uses the features list as labels for the heatmap's x-axis if features is provided;\n otherwise, it defaults to strings of the numerical labels starting from 1 up to the number of\n columns in the array.\n\n Example:\n >>> np.random.seed(0)\n >>> array = np.random.rand(2, 5)\n >>> ax = f_820(array, features=['A', 'B', 'C', 'D', 'E'], seed=1)\n >>> type(ax)\n \n >>> ax.collections[0].get_array().data.flatten()\n array([0.60276338, 0.71518937, 0.4236548 , 0.5488135 , 0.54488318,\n 0.891773 , 0.43758721, 0.38344152, 0.64589411, 0.96366276])\n \"\"\"", "canonical_solution": "\n if seed is not None:\n np.random.seed(seed)\n\n if array.size == 0 or len(array.shape) != 2:\n raise ValueError(\"Input array must be 2-dimensional and non-empty.\")\n\n if features is not None and len(features) != array.shape[1]:\n raise ValueError(\"Features list must match the number of columns in the array.\")\n\n shuffled_array = np.random.permutation(array.T).T\n\n fig, ax = plt.subplots()\n sns.heatmap(\n shuffled_array,\n xticklabels=features if features is not None else np.arange(array.shape[1]) + 1,\n ax=ax,\n )\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n self.array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n 
self.expected_labels = [\"1\", \"2\", \"3\", \"4\", \"5\"]\n def test_default_features(self):\n \"\"\"Test heatmap with default features.\"\"\"\n ax = f_820(self.array)\n xticklabels = [tick.get_text() for tick in ax.get_xticklabels()]\n self.assertEqual(xticklabels, self.expected_labels)\n self.assertTrue(len(ax.collections), 1)\n def test_custom_features(self):\n \"\"\"Test heatmap with custom features.\"\"\"\n custom_labels = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n ax = f_820(self.array, features=custom_labels)\n xticklabels = [tick.get_text() for tick in ax.get_xticklabels()]\n self.assertEqual(xticklabels, custom_labels)\n self.assertTrue(len(ax.collections), 1)\n def test_features_mismatch(self):\n \"\"\"Test for error when features list does not match array dimensions.\"\"\"\n with self.assertRaises(ValueError):\n f_820(self.array, features=[\"A\", \"B\"])\n def test_seed_reproducibility(self):\n \"\"\"Test if seeding makes shuffling reproducible.\"\"\"\n ax1 = f_820(self.array, seed=42)\n ax2 = f_820(self.array, seed=42)\n heatmap_data1 = ax1.collections[0].get_array().data\n heatmap_data2 = ax2.collections[0].get_array().data\n np.testing.assert_array_equal(heatmap_data1, heatmap_data2)\n def test_empty_array(self):\n \"\"\"Test for handling an empty array.\"\"\"\n with self.assertRaises(ValueError):\n f_820(np.array([]))\n def tearDown(self):\n \"\"\"Cleanup plot figures after each test.\"\"\"\n plt.close(\"all\")", "apis": ["numpy.arange", "numpy.random", "numpy.random.seed", "seaborn.heatmap", "numpy.random.permutation", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "seaborn"], "doc": {"description": ["Shuffles the columns of a given 2D numpy array and visualizes it as a heatmap.", "Notes:", "- This function uses the features list as labels for the heatmap's x-axis if features is provided;", "otherwise, it defaults to strings of the numerical labels starting from 1 up to the number of", "columns in the array."], "note": [], "params": 
["array (ndarray): The 2D numpy array to shuffle and plot. It must not be empty.", "features (list of str, optional): Custom labels for the columns after shuffling.", "If not specified, default numerical labels are used.", "The list must match the number of columns in 'array'.", "seed (int, optional): Seed for the random number generator to ensure reproducibility of the shuffle."], "returns": ["Axes: The matplotlib Axes object containing the heatmap."], "reqs": ["numpy", "matplotlib.pyplot", "seaborn"], "raises": ["ValueError: If 'features' is provided and does not match the number of columns in 'array'; and", "if 'array' is empty or not 2-dimensional."], "example": [">>> np.random.seed(0)", ">>> array = np.random.rand(2, 5)", ">>> ax = f_820(array, features=['A', 'B', 'C', 'D', 'E'], seed=1)", ">>> type(ax)", "", ">>> ax.collections[0].get_array().data.flatten()", "array([0.60276338, 0.71518937, 0.4236548 , 0.5488135 , 0.54488318,", "0.891773 , 0.43758721, 0.38344152, 0.64589411, 0.96366276])"]}} -{"task_id": "f_813", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_813(data: np.ndarray) -> plt.Axes:\n \"\"\"\n Plots the cumulative probability distribution of a given NumPy array of numbers,\n representing how the cumulative probability increases with the sorted data indexes.\n\n Parameters:\n - data (numpy.ndarray): The input NumPy array of non-negative numbers.\n\n Returns:\n - matplotlib.pyplot.Axes: The plot of cumulative probabilities.\n\n Requirements:\n - numpy\n - matplotlib\n\n Raises:\n - ValueError: If the input array contains negative numbers or NaNs.\n - TypeError: If the input array contains non-numeric inputs.\n\n Note:\n - In case of an all-zeros input, the cumulative probability remains at 0 across all indexes.\n - The plot uses marker ('o') and a solid line ('-') for the cumulative probability curve.\n - The plot is titled \"Cumulative Probability Plot\", with \"Index\" on the x-axis and\n \"Cumulative Probability\" on the 
y-axis.\n\n Example:\n >>> ax = f_813(np.array([1, 2, 3, 4, 5]))\n >>> ax.get_title()\n 'Cumulative Probability Plot'\n \"\"\"", "canonical_solution": " if np.any(data < 0) or np.isnan(data).any():\n raise ValueError(\"Input array contains negative numbers or NaNs.\")\n\n if not np.issubdtype(data.dtype, np.number):\n raise TypeError(\"Input array contains non-numeric values.\")\n\n data_sorted = np.sort(data)\n cumulative_prob = (\n np.cumsum(data_sorted) / np.sum(data_sorted)\n if np.sum(data_sorted) != 0\n else np.zeros_like(data_sorted)\n )\n fig, ax = plt.subplots()\n ax.plot(cumulative_prob, marker=\"o\", linestyle=\"-\")\n ax.set_xlabel(\"Index\")\n ax.set_ylabel(\"Cumulative Probability\")\n ax.set_title(\"Cumulative Probability Plot\")\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.lines import Line2D\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def helper_assert_plot_attributes(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertIn(\"Cumulative Probability Plot\", ax.get_title())\n self.assertIn(\"Index\", ax.get_xlabel())\n self.assertIn(\"Cumulative Probability\", ax.get_ylabel())\n lines = ax.get_lines()\n self.assertIsInstance(\n lines[0], Line2D, \"The plot should contain a Line2D object.\"\n )\n self.assertEqual(lines[0].get_marker(), \"o\", \"The marker should be 'o'.\")\n self.assertEqual(lines[0].get_linestyle(), \"-\", \"The linestyle should be '-'.\")\n def helper_assert_cumulative_probability_correctness(\n self, ax, expected_cumulative_prob\n ):\n line = ax.get_lines()[0]\n np.testing.assert_array_almost_equal(\n line.get_ydata(),\n expected_cumulative_prob,\n decimal=2,\n err_msg=\"Cumulative probability calculation is incorrect.\",\n )\n def test_negative_numbers(self):\n data = np.array([-1, 0, 1, 2, 3])\n with self.assertRaises(ValueError):\n f_813(data)\n def test_nan_values(self):\n data = np.array([1, 2, 3, np.nan, 5])\n with 
self.assertRaises(ValueError):\n f_813(data)\n def test_non_numeric_values(self):\n data = np.array([1, 2, 3, \"hello\", 5])\n with self.assertRaises(TypeError):\n f_813(data)\n def test_increasing_array(self):\n data = np.array([1, 2, 3])\n ax = f_813(data)\n expected_cumulative_prob = np.array([1 / 6, 1 / 2, 1])\n self.helper_assert_plot_attributes(ax=ax)\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )\n def test_constant_array(self):\n data = np.array([1, 1, 1, 1, 1])\n ax = f_813(data)\n self.helper_assert_plot_attributes(ax)\n expected_cumulative_prob = np.array([0.2, 0.4, 0.6, 0.8, 1.0])\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )\n def test_zeros_array(self):\n data = np.array([0, 0, 0, 0, 0])\n ax = f_813(data)\n self.helper_assert_plot_attributes(ax)\n expected_cumulative_prob = np.array([0, 0, 0, 0, 0])\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )\n def test_single_element_array(self):\n data = np.array([7])\n ax = f_813(data)\n self.helper_assert_plot_attributes(ax)\n expected_cumulative_prob = np.array([1])\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )", "apis": ["numpy.ndarray", "numpy.any", "numpy.cumsum", "numpy.issubdtype", "matplotlib.pyplot.Axes", "numpy.isnan", "numpy.sort", "numpy.number", "numpy.sum", "matplotlib.pyplot.subplots", "numpy.zeros_like"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Plots the cumulative probability distribution of a given NumPy array of numbers,", "representing how the cumulative probability increases with the sorted data indexes."], "note": ["In case of an all-zeros input, the cumulative probability remains at 0 across all indexes.", "The plot uses marker ('o') and a solid line ('-') for the cumulative 
probability curve.", "The plot is titled \"Cumulative Probability Plot\", with \"Index\" on the x-axis and", "\"Cumulative Probability\" on the y-axis."], "params": ["data (numpy.ndarray): The input NumPy array of non-negative numbers."], "returns": ["matplotlib.pyplot.Axes: The plot of cumulative probabilities."], "reqs": ["numpy", "matplotlib"], "raises": ["ValueError: If the input array contains negative numbers or NaNs.", "TypeError: If the input array contains non-numeric inputs."], "example": [">>> ax = f_813(np.array([1, 2, 3, 4, 5]))", ">>> ax.get_title()", "'Cumulative Probability Plot'"]}} -{"task_id": "f_836", "prompt": "import re\nimport pandas as pd\nfrom scipy.stats import gaussian_kde\nfrom scipy import linalg\nimport matplotlib.pyplot as plt\n\n\ndef f_836(text):\n \"\"\"\n This code takes a text input, calculates the lengths of the words, \n and visualizes the distribution of word lengths using a histogram and a KDE curve (if applicable) on a matplotlib subplot.\n\n Parameters:\n text (str): The text string to be analyzed. The function can handle strings with various types \n of characters and punctuation.\n\n Returns:\n matplotlib.axes._subplots.AxesSubplot: An Axes object showing the histogram and optionally the KDE \n plot of word lengths. This visual representation helps in \n understanding the distribution of word lengths in the given text.\n\n Requirements:\n - re\n - matplotlib\n - scipy\n - matplotlib\n\n Example:\n >>> ax = f_836('Hello world! 
This is a test.')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " words = re.split(r\"\\W+\", text)\n word_counts = [len(word) for word in words if word]\n\n _, ax = plt.subplots()\n\n if word_counts: # Check if word_counts is not empty\n ax.hist(word_counts, bins=30, edgecolor='black', alpha=0.7)\n\n # Add KDE plot if applicable\n if len(word_counts) > 1 and np.var(word_counts) != 0:\n try:\n kde = gaussian_kde(word_counts)\n x_range = np.linspace(min(word_counts), max(word_counts), 100)\n ax.plot(x_range, kde(x_range), color='red') # KDE line in red\n except linalg.LinAlgError:\n # Handle the singular matrix error\n pass\n\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the f_836 function\"\"\"\n def test_simple_sentence(self):\n \"\"\"Test a simple sentence\"\"\"\n ax1 = f_836(\"This is a test\")\n self.assertIsInstance(ax1, plt.Axes)\n # The number of bars might differ due to matplotlib's binning strategy\n unique_word_lengths = {len(word) for word in \"This is a test\".split() if word}\n self.assertTrue(\n len(ax1.patches) >= len(unique_word_lengths),\n \"Incorrect number of bars for a simple sentence\",\n )\n def test_empty_string(self):\n \"\"\"Test an empty string\"\"\"\n ax2 = f_836(\"\")\n self.assertIsInstance(ax2, plt.Axes)\n self.assertEqual(\n len(ax2.patches), 0, \"There should be no bars for an empty string\"\n )\n def test_special_characters(self):\n \"\"\"Test special characters and numbers\"\"\"\n ax3 = f_836(\"Hello, world! 1234\")\n self.assertIsInstance(ax3, plt.Axes)\n # The number of bars might differ due to matplotlib's binning strategy\n unique_word_lengths = {\n len(word) for word in \"Hello, world! 
1234\".split() if word\n }\n self.assertTrue(\n len(ax3.patches) >= len(unique_word_lengths),\n \"Incorrect handling of special characters and numbers\",\n )\n def test_repeated_words(self):\n \"\"\"Test repeated words\"\"\"\n ax4 = f_836(\"repeat repeat repeat\")\n self.assertIsInstance(ax4, plt.Axes)\n # Only one unique word length: 6\n self.assertTrue(len(ax4.patches) >= 1, \"Incorrect handling of repeated words\")\n def test_long_text(self):\n \"\"\"Test a long text\"\"\"\n text = \"A long text with multiple words of different lengths\"\n ax5 = f_836(text)\n self.assertIsInstance(ax5, plt.Axes)\n # Adjust expectation for number of bars due to matplotlib's binning\n words = re.split(r\"\\W+\", text)\n word_counts = pd.Series([len(word) for word in words if word])\n expected_unique_lengths = len(set(word_counts))\n self.assertTrue(\n len(ax5.patches) >= expected_unique_lengths,\n \"Incorrect plot for a long text\",\n )\n def tearDown(self):\n plt.clf()", "apis": ["re.split", "scipy.stats.gaussian_kde", "matplotlib.pyplot.subplots", "scipy.linalg.LinAlgError"], "libs": ["matplotlib", "re", "scipy"], "doc": {"description": ["This code takes a text input, calculates the lengths of the words,", "and visualizes the distribution of word lengths using a histogram and a KDE curve (if applicable) on a matplotlib subplot."], "note": [], "params": ["text (str): The text string to be analyzed. The function can handle strings with various types", "of characters and punctuation."], "returns": ["matplotlib.axes._subplots.AxesSubplot: An Axes object showing the histogram and optionally the KDE", "plot of word lengths. This visual representation helps in", "understanding the distribution of word lengths in the given text."], "reqs": ["re", "matplotlib", "scipy", "matplotlib"], "raises": [], "example": [">>> ax = f_836('Hello world! 
This is a test.')", ">>> type(ax)", ""]}} -{"task_id": "f_842", "prompt": "import urllib.request\nimport os\nimport csv\nimport collections\n\n\ndef f_842(url, column_name, csv_file_path):\n \"\"\"\n Download a CSV file from a given URL, save it to a specified path, and count\n the occurrences of each value in a particular column. The function handles various\n scenarios including missing columns and file download errors.\n\n Parameters:\n url (str): The URL of the CSV file to be downloaded. Must be a valid and accessible URL.\n column_name (str): The name of the column in the CSV file whose values are to be counted.\n The function will raise a ValueError if this column is not found.\n csv_file_path (str): The file path where the downloaded CSV file will be saved.\n If a file already exists at this path, it will be overwritten.\n\n Returns:\n dict: A dictionary mapping the values from the specified column to their\n corresponding occurrence counts.\n\n Raises:\n ValueError: If the specified column_name does not exist in the CSV file, the function\n will delete the downloaded file and raise a ValueError with a message\n stating \"The provided column_name '{column_name}' does not exist in the CSV file.\"\n\n Requirements:\n - urllib\n - os\n - csv\n - collections\n\n Example:\n >>> f_842('http://example.com/data.csv', 'category', 'downloaded_data.csv')\n {'cat1': 5, 'cat2': 3, 'cat3': 8}\n # This is a hypothetical output; the actual output will depend on the CSV data.\n\n Notes:\n - The downloaded CSV file is deleted after its contents have been processed.\n - The function only counts values in the specified column and ignores other data.\n \"\"\"", "canonical_solution": " urllib.request.urlretrieve(url, csv_file_path)\n\n with open(csv_file_path, \"r\", encoding=\"utf-8\") as f:\n reader = csv.DictReader(f)\n if column_name not in reader.fieldnames:\n os.remove(csv_file_path)\n raise ValueError(\n f\"The provided column_name '{column_name}' does not exist in the CSV 
file.\"\n )\n values = [row[column_name] for row in reader]\n\n os.remove(csv_file_path)\n\n return collections.Counter(values)", "test": "import unittest\nfrom unittest.mock import patch, mock_open\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_842 function.\"\"\"\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"category,other\\n\" + \"cat1,x\\n\" * 2 + \"cat2,y\\n\" * 2 + \"cat3,z\\n\",\n )\n def test_count_categories_data1(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each category in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"category\", \"/mock/path/data1.csv\")\n self.assertEqual(result, {\"cat1\": 2, \"cat2\": 2, \"cat3\": 1})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"name,other\\n\" + \"Alice,x\\n\" * 2 + \"Bob,y\\n\" + \"Charlie,z\\n\",\n )\n def test_count_names_data2(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each name in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"name\", \"/mock/path/data2.csv\")\n self.assertEqual(result, {\"Alice\": 2, \"Bob\": 1, \"Charlie\": 1})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"category,other\\n\" + \"cat1,x\\n\" * 2 + \"cat2,y\\n\" + \"cat3,z\\n\" * 2,\n )\n def test_count_categories_data3(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each category in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"category\", \"/mock/path/data3.csv\")\n self.assertEqual(result, {\"cat1\": 2, \"cat2\": 1, \"cat3\": 2})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n 
new_callable=mock_open,\n read_data=\"name,other\\n\" + \"Alice,x\\n\" * 3 + \"Bob,y\\n\" + \"Charlie,z\\n\",\n )\n def test_count_names_data3(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each name in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"name\", \"/mock/path/data3.csv\")\n self.assertEqual(result, {\"Alice\": 3, \"Bob\": 1, \"Charlie\": 1})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"name,other\\n\" + \"Alice,x\\n\" * 3 + \"Bob,y\\n\" + \"Charlie,z\\n\",\n )\n def test_non_existent_column(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function raises an exception when the specified column does not exist.\"\"\"\n with self.assertRaises(ValueError):\n f_842(\"mock_url\", \"non_existent_column\", \"/mock/path/data3.csv\")", "apis": ["csv.DictReader", "urllib.request.urlretrieve", "collections.Counter", "os.remove", "urllib.request"], "libs": ["csv", "collections", "os", "urllib"], "doc": {"description": ["Download a CSV file from a given URL, save it to a specified path, and count", "the occurrences of each value in a particular column. The function handles various", "scenarios including missing columns and file download errors.", "Notes:", "- The downloaded CSV file is deleted after its contents have been processed.", "- The function only counts values in the specified column and ignores other data."], "note": [], "params": ["url (str): The URL of the CSV file to be downloaded. 
Must be a valid and accessible URL.", "column_name (str): The name of the column in the CSV file whose values are to be counted.", "The function will raise a ValueError if this column is not found.", "csv_file_path (str): The file path where the downloaded CSV file will be saved.", "If a file already exists at this path, it will be overwritten."], "returns": ["dict: A dictionary mapping the values from the specified column to their", "corresponding occurrence counts."], "reqs": ["urllib", "os", "csv", "collections"], "raises": ["ValueError: If the specified column_name does not exist in the CSV file, the function", "will delete the downloaded file and raise a ValueError with a message", "stating \"The provided column_name '{column_name}' does not exist in the CSV file.\""], "example": [">>> f_842('http://example.com/data.csv', 'category', 'downloaded_data.csv')", "{'cat1': 5, 'cat2': 3, 'cat3': 8}", "# This is a hypothetical output; the actual output will depend on the CSV data."]}} -{"task_id": "f_397", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom datetime import datetime\n\n\ndef f_397(column, data):\n \"\"\"\n Analyze and visualize statistical properties of a specified weather data column.\n\n This function calculates the sum, mean, minimum, and maximum values of a specified column in the given data.\n It also generates a histogram plot of the data in the column. The dataset is expected to be a list of weather\n observations, where each observation includes date, temperature, humidity, wind speed, and precipitation values.\n If the provided data list is empty, resulting in an empty DataFrame, the function handles it by setting:\n - The 'mean' value to np.nan.\n - The 'min' value to np.inf.\n - The 'max' value to -np.inf.\n\n Parameters:\n column (str): The column to analyze. 
Valid columns include 'Temperature', 'Humidity', 'Wind Speed', and 'Precipitation'.\n data (list of lists): The weather data where each inner list contains the following format:\n [Date (datetime object), Temperature (int), Humidity (int), Wind Speed (int), Precipitation (float)]\n\n Returns:\n - result (dict): A dictionary containing:\n - 'sum': Sum of the values in the specified column.\n - 'mean': Mean of the values in the specified column.\n - 'min': Minimum value in the specified column.\n - 'max': Maximum value in the specified column.\n - 'plot': A matplotlib BarContainer object of the histogram plot for the specified column.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> data = [[datetime(2022, 1, 1), -5, 80, 10, 0], [datetime(2022, 1, 3), -2, 83, 15, 0]]\n >>> result = f_397('Temperature', data)\n >>> result['sum']\n -7\n >>> type(result['plot'])\n \n \"\"\"", "canonical_solution": " COLUMNS = [\"Date\", \"Temperature\", \"Humidity\", \"Wind Speed\", \"Precipitation\"]\n df = pd.DataFrame(data, columns=COLUMNS)\n column_data = df[column]\n\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.nan if df.empty else np.mean(column_data),\n \"min\": np.inf if df.empty else np.min(column_data),\n \"max\": -np.inf if df.empty else np.max(column_data),\n }\n\n _, _, ax = plt.hist(column_data)\n plt.title(f\"Histogram of {column}\")\n\n result[\"plot\"] = ax\n\n return result", "test": "import unittest\nimport matplotlib\nimport matplotlib.pyplot as plt\nfrom datetime import datetime\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.data = [\n [datetime(2022, 1, 1), -5, 80, 10, 0],\n [datetime(2022, 1, 2), -3, 85, 12, 0.5],\n [datetime(2022, 1, 3), -2, 83, 15, 0],\n [datetime(2022, 1, 4), -1, 82, 13, 0.2],\n [datetime(2022, 1, 5), 0, 80, 11, 0.1],\n ]\n def test_case_1(self):\n # Testing the 'Temperature' column\n result = f_397(\"Temperature\", self.data)\n self.assertEqual(result[\"sum\"], 
-11)\n self.assertEqual(result[\"mean\"], -2.2)\n self.assertEqual(result[\"min\"], -5)\n self.assertEqual(result[\"max\"], 0)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_2(self):\n # Testing the 'Humidity' column\n result = f_397(\"Humidity\", self.data)\n self.assertEqual(result[\"sum\"], 410)\n self.assertEqual(result[\"mean\"], 82)\n self.assertEqual(result[\"min\"], 80)\n self.assertEqual(result[\"max\"], 85)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_3(self):\n # Testing the 'Wind Speed' column\n result = f_397(\"Wind Speed\", self.data)\n self.assertEqual(result[\"sum\"], 61)\n self.assertEqual(result[\"mean\"], 12.2)\n self.assertEqual(result[\"min\"], 10)\n self.assertEqual(result[\"max\"], 15)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_4(self):\n # Testing the 'Precipitation' column\n result = f_397(\"Precipitation\", self.data)\n self.assertAlmostEqual(result[\"sum\"], 0.8, places=6)\n self.assertAlmostEqual(result[\"mean\"], 0.16, places=6)\n self.assertAlmostEqual(result[\"min\"], 0, places=6)\n self.assertAlmostEqual(result[\"max\"], 0.5, places=6)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_5(self):\n # Testing with empty data\n result = f_397(\"Temperature\", [])\n self.assertTrue(np.isnan(result[\"mean\"]))\n self.assertEqual(result[\"sum\"], 0)\n self.assertTrue(\n np.isinf(result[\"min\"]) and result[\"min\"] > 0\n ) # Checking for positive infinity for min\n self.assertTrue(\n np.isinf(result[\"max\"]) and result[\"max\"] < 0\n ) # Checking for negative infinity for max\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.inf", "numpy.mean", "numpy.nan", "matplotlib.pyplot.hist", "matplotlib.pyplot.title", "numpy.min", "numpy.max", "numpy.sum", "pandas.DataFrame"], 
"libs": ["numpy", "matplotlib", "pandas"], "doc": {"description": ["Analyze and visualize statistical properties of a specified weather data column.", "This function calculates the sum, mean, minimum, and maximum values of a specified column in the given data.", "It also generates a histogram plot of the data in the column. The dataset is expected to be a list of weather", "observations, where each observation includes date, temperature, humidity, wind speed, and precipitation values.", "If the provided data list is empty, resulting in an empty DataFrame, the function handles it by setting:", "- The 'mean' value to np.nan.", "- The 'min' value to np.inf.", "- The 'max' value to -np.inf."], "note": [], "params": ["column (str): The column to analyze. Valid columns include 'Temperature', 'Humidity', 'Wind Speed', and 'Precipitation'.", "data (list of lists): The weather data where each inner list contains the following format:", "[Date (datetime object), Temperature (int), Humidity (int), Wind Speed (int), Precipitation (float)]"], "returns": ["result (dict): A dictionary containing:", "'sum': Sum of the values in the specified column.", "'mean': Mean of the values in the specified column.", "'min': Minimum value in the specified column.", "'max': Maximum value in the specified column.", "'plot': A matplotlib BarContainer object of the histogram plot for the specified column."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [[datetime(2022, 1, 1), -5, 80, 10, 0], [datetime(2022, 1, 3), -2, 83, 15, 0]]", ">>> result = f_397('Temperature', data)", ">>> result['sum']", "-7", ">>> type(result['plot'])", ""]}} -{"task_id": "f_781", "prompt": "import re\nimport pandas as pd\n\ndef f_781(input_df):\n \"\"\"\n Cleans the text in a pandas DataFrame column named 'text' by removing all special characters, punctuation marks, and spaces, then calculates the length of the cleaned text.\n\n Requirements:\n - re\n - pandas\n\n Parameters:\n 
- input_df (pandas.DataFrame): DataFrame with a column 'text' containing strings with alphanumeric and/or special characters.\n\n Returns:\n - pandas.DataFrame: A DataFrame with two new columns 'clean_text' and 'text_length', where 'clean_text' is the cleaned text and 'text_length' is its length.\n\n Examples:\n >>> df = pd.DataFrame({'text': ['Special $#! characters spaces 888323']})\n >>> print(f_781(df))\n clean_text text_length\n 0 Specialcharactersspaces888323 29\n >>> df = pd.DataFrame({'text': ['Hello, World!']})\n >>> print(f_781(df))\n clean_text text_length\n 0 HelloWorld 10\n \"\"\"", "canonical_solution": " def clean_text_and_calculate_length(row):\n if pd.isnull(row['text']):\n return pd.Series(['', 0], index=['clean_text', 'text_length'])\n cleaned_text = re.sub('[^A-Za-z0-9]+', '', str(row['text']))\n return pd.Series([cleaned_text, len(cleaned_text)], index=['clean_text', 'text_length'])\n \n return input_df.apply(clean_text_and_calculate_length, axis=1)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.df = pd.DataFrame({'text': ['hello', 'world', 'Special $#! 
characters spaces 888323', 'Hello, World!', '', None]})\n def test_clean_text_and_calculate_length(self):\n result = f_781(self.df)\n expected_clean_text = ['hello', 'world', 'Specialcharactersspaces888323', 'HelloWorld', '', '']\n expected_text_length = [5, 5, 29, 10, 0, 0]\n pd.testing.assert_series_equal(result['clean_text'], pd.Series(expected_clean_text, name='clean_text'), check_names=False)\n pd.testing.assert_series_equal(result['text_length'], pd.Series(expected_text_length, name='text_length'), check_names=False)\n def test_with_special_characters(self):\n df = pd.DataFrame({'text': ['@@@hello***', '%%%world$$$']})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], 'hello')\n self.assertEqual(result['clean_text'].iloc[1], 'world')\n self.assertEqual(result['text_length'].iloc[0], 5)\n self.assertEqual(result['text_length'].iloc[1], 5)\n def test_with_numeric_strings(self):\n df = pd.DataFrame({'text': ['123', '4567']})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], '123')\n self.assertEqual(result['clean_text'].iloc[1], '4567')\n self.assertEqual(result['text_length'].iloc[0], 3)\n self.assertEqual(result['text_length'].iloc[1], 4)\n def test_empty_and_none(self):\n df = pd.DataFrame({'text': ['', None]})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], '')\n self.assertEqual(result['clean_text'].iloc[1], '')\n self.assertEqual(result['text_length'].iloc[0], 0)\n self.assertEqual(result['text_length'].iloc[1], 0)\n def test_mixed_cases(self):\n df = pd.DataFrame({'text': ['HelloWorld', 'HELLOworld123']})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], 'HelloWorld')\n self.assertEqual(result['clean_text'].iloc[1], 'HELLOworld123')\n self.assertEqual(result['text_length'].iloc[0], 10)\n self.assertEqual(result['text_length'].iloc[1], 13)", "apis": ["pandas.isnull", "re.sub", "pandas.Series"], "libs": ["pandas", "re"], "doc": {"description": ["Cleans the text in a pandas 
DataFrame column named 'text' by removing all special characters, punctuation marks, and spaces, then calculates the length of the cleaned text."], "note": [], "params": ["input_df (pandas.DataFrame): DataFrame with a column 'text' containing strings with alphanumeric and/or special characters."], "returns": ["pandas.DataFrame: A DataFrame with two new columns 'clean_text' and 'text_length', where 'clean_text' is the cleaned text and 'text_length' is its length."], "reqs": ["re", "pandas"], "raises": [], "example": ["Examples:", ">>> df = pd.DataFrame({'text': ['Special $#! characters spaces 888323']})", ">>> print(f_781(df))", "clean_text text_length", "0 Specialcharactersspaces888323 29", ">>> df = pd.DataFrame({'text': ['Hello, World!']})", ">>> print(f_781(df))", "clean_text text_length", "0 HelloWorld 10"]}} -{"task_id": "f_383", "prompt": "from datetime import datetime, timedelta\nimport pytz\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_383(start_time, end_time):\n \"\"\"\n Plots the hourly difference between UTC and specified global time zones across a date range.\n\n This function visualizes the time difference in hours between UTC and predefined time zones for each day\n within the specified date range. Predefined time zones include UTC, America/Los_Angeles, Europe/Paris,\n Asia/Kolkata, and Australia/Sydney. 
The differences are plotted on a graph, using a distinct color for\n each time zone's time difference curve, selecting from [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"].\n\n Parameters:\n - start_time (str): The start date in the format \"yyyy-mm-dd\".\n - end_time (str): The end date in the format \"yyyy-mm-dd\".\n\n Returns:\n - matplotlib.axes.Axes: The Axes object with the plotted time differences in hours between UTC and \n other time zones.\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - pytz\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_383('2021-01-01', '2021-01-10')\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(18628.0, 0, '2021-01-01'), Text(18629.0, 0, '2021-01-02'), Text(18630.0, 0, '2021-01-03'), Text(18631.0, 0, '2021-01-04'), Text(18632.0, 0, '2021-01-05'), Text(18633.0, 0, '2021-01-06'), Text(18634.0, 0, '2021-01-07'), Text(18635.0, 0, '2021-01-08'), Text(18636.0, 0, '2021-01-09')]\n \"\"\"", "canonical_solution": " # Constants\n TIMEZONES = [\n \"UTC\",\n \"America/Los_Angeles\",\n \"Europe/Paris\",\n \"Asia/Kolkata\",\n \"Australia/Sydney\",\n ]\n COLORS = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]\n\n start_date = datetime.strptime(start_time, \"%Y-%m-%d\")\n end_date = datetime.strptime(end_time, \"%Y-%m-%d\")\n current_tz = pytz.timezone(\"UTC\")\n dates = np.arange(start_date, end_date, timedelta(days=1)).astype(datetime)\n differences = []\n for tz in TIMEZONES:\n other_tz = pytz.timezone(tz)\n difference = [\n (other_tz.localize(dt) - current_tz.localize(dt)).total_seconds() / 3600\n for dt in dates\n ]\n differences.append(difference)\n fig, ax = plt.subplots()\n for i, difference in enumerate(differences):\n ax.plot(dates, difference, color=COLORS[i % len(COLORS)], label=TIMEZONES[i])\n ax.set_xlabel(\"Date\")\n ax.set_ylabel(\"Time difference (hours)\")\n ax.legend()\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def 
test_case_1(self):\n # Test basic functionality\n ax = f_383(\"2021-01-01\", \"2021-01-10\")\n self._common_assertions(ax)\n def test_case_2(self):\n # Test single day range\n ax = f_383(\"2021-01-01\", \"2021-01-01\")\n self._common_assertions(ax)\n def test_case_3(self):\n # Test leap year\n ax = f_383(\"2020-02-28\", \"2020-03-01\")\n self._common_assertions(ax)\n def test_case_4(self):\n # Test DST transition\n ax = f_383(\"2021-03-27\", \"2021-03-29\")\n self._common_assertions(ax)\n def test_case_5(self):\n # Test plotting consistency\n ax = f_383(\"2021-01-01\", \"2021-01-10\")\n colors = [line.get_color() for line in ax.get_lines()]\n self.assertEqual(len(set(colors)), len(colors)) # Check if colors are unique\n def test_case_6(self):\n # Testing input validation via invalid date format\n with self.assertRaises(ValueError):\n f_383(\"01-01-2021\", \"10-01-2021\")\n def _common_assertions(self, ax):\n \"\"\"Common assertions for all test cases\"\"\"\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel().lower(), \"time difference (hours)\".lower())\n legend_labels = [text.get_text() for text in ax.get_legend().get_texts()]\n expected_timezones = [\n \"UTC\",\n \"America/Los_Angeles\",\n \"Europe/Paris\",\n \"Asia/Kolkata\",\n \"Australia/Sydney\",\n ]\n self.assertListEqual(legend_labels, expected_timezones)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "datetime.datetime.strptime", "datetime.timedelta", "pytz.timezone", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "datetime", "pytz"], "doc": {"description": ["Plots the hourly difference between UTC and specified global time zones across a date range.", "This function visualizes the time difference in hours between UTC and predefined time zones for each day", "within the specified date range. Predefined time zones include UTC, America/Los_Angeles, Europe/Paris,", "Asia/Kolkata, and Australia/Sydney. 
The differences are plotted on a graph, using a distinct color for", "each time zone's time difference curve, selecting from [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]."], "note": [], "params": ["start_time (str): The start date in the format \"yyyy-mm-dd\".", "end_time (str): The end date in the format \"yyyy-mm-dd\"."], "returns": ["matplotlib.axes.Axes: The Axes object with the plotted time differences in hours between UTC and", "other time zones."], "reqs": ["datetime.datetime", "datetime.timedelta", "pytz", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_383('2021-01-01', '2021-01-10')", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(18628.0, 0, '2021-01-01'), Text(18629.0, 0, '2021-01-02'), Text(18630.0, 0, '2021-01-03'), Text(18631.0, 0, '2021-01-04'), Text(18632.0, 0, '2021-01-05'), Text(18633.0, 0, '2021-01-06'), Text(18634.0, 0, '2021-01-07'), Text(18635.0, 0, '2021-01-08'), Text(18636.0, 0, '2021-01-09')]"]}} -{"task_id": "f_818", "prompt": "import numpy as np\nimport pandas as pd\n\ndef f_818(rows, columns=[\"A\", \"B\", \"C\", \"D\", \"E\"], seed=0) -> pd.DataFrame:\n \"\"\"\n Create a Pandas DataFrame with a specified number of rows filled with random\n values in [0, 1) and shuffled columns.\n \n Note:\n - The columns should be unique and sorted in the ascending order.\n\n Parameters:\n rows (int): The number of rows for the DataFrame. Must not be negative.\n columns (list of str): Column names for the DataFrame.\n Defaults to ['A', 'B', 'C', 'D', 'E'].\n If it contains repeated columns, the function deduplicates\n it in a case and spacing sensitive way. 
If it is empty,\n the function returns an empty DataFrame.\n seed (int): The random seed for reproducibility.\n \n Returns:\n pd.DataFrame: A pandas DataFrame with shuffled columns.\n\n Requirements:\n - numpy\n - pandas\n\n Example:\n >>> df = f_818(10)\n >>> df.head(2)\n D E A C B\n 0 0.548814 0.715189 0.602763 0.544883 0.423655\n 1 0.645894 0.437587 0.891773 0.963663 0.383442\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n columns = sorted(list(set(columns)))\n data = np.random.rand(rows, len(columns))\n np.random.shuffle(columns)\n df = pd.DataFrame(data, columns=columns)\n return df", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case - data and format correctness\n df = f_818(10, seed=0)\n default_columns = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n self.assertEqual(df.shape, (10, 5))\n for column in default_columns:\n self.assertEqual(df.dtypes[column], np.float64)\n self.assertEqual(len(set(df.columns)), len(default_columns))\n def test_case_2(self):\n # Test custom columns\n custom_columns = [\"X\", \"Y\", \"Z\"]\n df = f_818(5, columns=custom_columns, seed=0)\n self.assertTrue(all(column in custom_columns for column in df.columns))\n # assert first 2 rows data\n self.assertEqual(df.iloc[0].tolist(), [0.5488135039273248, 0.7151893663724195, 0.6027633760716439])\n \n def test_case_3(self):\n # Test custom rows\n for n_rows in [1, 10, 50]:\n df = f_818(n_rows)\n self.assertEqual(len(df), n_rows)\n def test_case_4(self):\n df = f_818(5, seed=42)\n self.assertEqual(df.iloc[0].tolist(), [0.3745401188473625, 0.9507143064099162, 0.7319939418114051, 0.5986584841970366, 0.15601864044243652])\n def test_case_5(self):\n # Test handling edge cases - negative rows\n with self.assertRaises(ValueError):\n f_818(-1)\n def test_case_6(self):\n # Test handling empty columns\n df = f_818(5, columns=[])\n self.assertTrue(df.empty)\n self.assertEqual(df.shape, (5, 0))\n def 
test_case_7(self):\n # Test handling duplicate columns\n df = f_818(5, columns=[\"A\", \"A\", \"B\", \"B\", \"C\"], seed=0)\n self.assertEqual(len(df.columns), 3)", "apis": ["numpy.random", "numpy.random.seed", "numpy.random.shuffle", "numpy.random.rand", "pandas.DataFrame"], "libs": ["numpy", "pandas"], "doc": {"description": ["Create a Pandas DataFrame with a specified number of rows filled with random", "values in [0, 1) and shuffled columns."], "note": ["The columns should be unique and sorted in the ascending order."], "params": ["rows (int): The number of rows for the DataFrame. Must not be negative.", "columns (list of str): Column names for the DataFrame.", "Defaults to ['A', 'B', 'C', 'D', 'E'].", "If it contains repeated columns, the function deduplicates", "it in a case and spacing sensitive way. If it is empty,", "the function returns an empty DataFrame.", "seed (int): The random seed for reproducibility."], "returns": ["pd.DataFrame: A pandas DataFrame with shuffled columns."], "reqs": ["numpy", "pandas"], "raises": [], "example": [">>> df = f_818(10)", ">>> df.head(2)", "D E A C B", "0 0.548814 0.715189 0.602763 0.544883 0.423655", "1 0.645894 0.437587 0.891773 0.963663 0.383442"]}} -{"task_id": "f_327", "prompt": "import random\nimport matplotlib.pyplot as plt\n\n\ndef f_327(points: int):\n \"\"\"\n Generate a plot of random numbers such that indices are on the x-axis and generated numbers are on the y-axis.\n\n Parameters:\n - points (int): Number of random points to generate.\n\n Returns:\n - Returns a tuple containing:\n - A list of generated random numbers.\n - A matplotlib Axes object representing the plot.\n\n Requirements:\n - random\n - matplotlib.pyplot\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> f_327(5)\n ([0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085], )\n >>> f_327(3)\n ([0.4049341374504143, 0.7837985890347726, 0.30331272607892745], )\n \"\"\"", "canonical_solution": " 
x = list(range(points))\n y = [random.random() for _ in range(points)]\n\n _, ax = plt.subplots()\n ax.plot(x, y)\n\n return y, ax", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n random.seed(0)\n y, _ = f_327(5)\n # Test correct number of points are generated\n self.assertEqual(len(y), 5)\n def test_case_2(self):\n random.seed(0)\n y, _ = f_327(5)\n # Test expected values\n self.assertTrue(all(0 <= num <= 1 for num in y))\n self.assertAlmostEqual(\n y,\n [\n 0.8444218515250481,\n 0.7579544029403025,\n 0.420571580830845,\n 0.25891675029296335,\n 0.5112747213686085,\n ],\n )\n def test_case_3(self):\n random.seed(0)\n # Test incorrect data types\n with self.assertRaises(TypeError):\n f_327(\"5\")\n with self.assertRaises(TypeError):\n f_327([])\n with self.assertRaises(TypeError):\n f_327(None)\n def test_case_4(self):\n random.seed(0)\n # Test handling 1 number\n y, ax = f_327(1)\n # Assert that 1 random number is generated\n self.assertEqual(len(y), 1)\n # Assert that the plot has the correct x and y data\n self.assertEqual(list(ax.lines[0].get_xdata()), [0])\n self.assertEqual(list(ax.lines[0].get_ydata()), y)\n def test_case_5(self):\n random.seed(0)\n # Test handling no random numbers\n y, ax = f_327(0)\n self.assertEqual(len(y), 0)\n # Assert that the plot has no data\n self.assertEqual(list(ax.lines[0].get_xdata()), [])\n self.assertEqual(list(ax.lines[0].get_ydata()), [])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "random.random"], "libs": ["matplotlib", "random"], "doc": {"description": ["Generate a plot of random numbers such that indices are on the x-axis and generated numbers are on the y-axis."], "note": [], "params": ["points (int): Number of random points to generate."], "returns": ["Returns a tuple containing:", "A list of generated random numbers.", "A matplotlib Axes object representing the plot."], "reqs": ["random", "matplotlib.pyplot"], "raises": 
[], "example": [">>> import random", ">>> random.seed(0)", ">>> f_327(5)", "([0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085], )", ">>> f_327(3)", "([0.4049341374504143, 0.7837985890347726, 0.30331272607892745], )"]}} -{"task_id": "f_532", "prompt": "import os\nimport random\nimport json\n\ndef f_532(directory, n):\n \"\"\"\n Create n random files in a directory with json content with the key 'number' and a random integer value between 1 and 100, and then reset the cursor to the beginning of each file.\n\n Parameters:\n - directory (str): The directory in which to generate the files.\n - n (int): The number of files to generate.\n\n Returns:\n - directory (str): The directory in which the files were generated.\n\n Requirements:\n - os\n - random\n - json\n\n Example:\n >>> f_532('/path/to/directory', 1)\n '/path/to/directory'\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n os.makedirs(directory)\n\n for i in range(n):\n filename = str(i) + \".json\"\n filepath = os.path.join(directory, filename)\n\n with open(filepath, 'w') as file:\n json.dump({'number': random.randint(1, 100)}, file)\n file.seek(0)\n\n return directory", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n shutil.rmtree('./source', ignore_errors=True)\n shutil.rmtree('./src', ignore_errors=True)\n shutil.rmtree('./s', ignore_errors=True)\n def test_case_1(self):\n random.seed(0)\n directory = f_532('./source', 10)\n self.assertTrue(os.path.exists(directory))\n read_data = []\n for file in sorted(os.listdir(directory)):\n with open(os.path.join(directory, file), 'r') as f:\n read_data.append(json.load(f))\n self.assertEqual(read_data, [{'number': 50}, {'number': 98}, {'number': 54}, {'number': 6}, {'number': 34}, {'number': 66}, {'number': 63}, {'number': 52}, {'number': 39}, {'number': 62}])\n shutil.rmtree(directory)\n def test_case_2(self):\n random.seed(1)\n directory = 
f_532('./src', 1)\n self.assertTrue(os.path.exists(directory))\n read_data = []\n for file in os.listdir(directory):\n with open(os.path.join(directory, file), 'r') as f:\n read_data.append(json.load(f))\n self.assertEqual(read_data, [{'number': 18}])\n shutil.rmtree(directory)\n def test_case_3(self):\n directory = f_532('./s', 100)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 100)\n shutil.rmtree(directory)\n def test_case_4(self):\n directory = f_532('./s', 0)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 0)\n shutil.rmtree(directory)\n def test_case_5(self):\n random.seed(2)\n directory = f_532('./source', 1)\n self.assertTrue(os.path.exists(directory))\n read_data = []\n for file in os.listdir(directory):\n with open(os.path.join(directory, file), 'r') as f:\n read_data.append(json.load(f))\n self.assertEqual(read_data, [{'number': 8}])\n shutil.rmtree(directory)", "apis": ["os.path.exists", "random.randint", "os.path", "json.dump", "os.makedirs", "os.path.join"], "libs": ["json", "random", "os"], "doc": {"description": ["Create n random files in a directory with json content with the key 'number' and a random integer value between 1 and 100, and then reset the cursor to the beginning of each file."], "note": [], "params": ["directory (str): The directory in which to generate the files.", "n (int): The number of files to generate."], "returns": ["directory (str): The directory in which the files were generated."], "reqs": ["os", "random", "json"], "raises": [], "example": [">>> f_532('/path/to/directory', 1)", "'/path/to/directory'"]}} -{"task_id": "f_806", "prompt": "import os\nimport glob\nfrom pathlib import Path\nimport zipfile\n\n\ndef f_806(source_directory, target_directory, zip_name):\n \"\"\"\n Zip files with certain extensions from a source directory and save it as a zip file\n saved to a target directory.\n\n Parameters:\n - source_directory (str): The 
source directory containing the files to be zipped.\n - target_directory (str): The destination directory of the zip file to be created.\n If it does not exist, the function will create it.\n - zip_name (str): The name of the zip file to create (without extension; '.zip' will be added automatically).\n\n Returns:\n - str: The full path to the created zip file in the format \"/path/to/target_directory/zip_name.zip\".\n\n Raises:\n - OSError: If the source_directory does not exist.\n\n Requirements:\n - os\n - glob\n - pathlib\n - zipfile\n\n Note:\n - The valid extensions are: ['.txt', '.docx', '.xlsx', '.csv'].\n\n\n Example:\n >>> path = f_806('/path/to/source_directory', '/path/to/target_directory', 'zipped_files')\n >>> type(path)\n \n >>> path\n '/path/to/target_directory/zipped_files.zip'\n \"\"\"", "canonical_solution": " if not os.path.exists(source_directory):\n raise OSError(\"source_directory must exist.\")\n if not os.path.exists(target_directory):\n os.makedirs(target_directory, exist_ok=True)\n\n zip_path = os.path.join(target_directory, f\"{zip_name.strip()}.zip\")\n with zipfile.ZipFile(zip_path, \"w\") as zipf:\n for extension in [\".txt\", \".docx\", \".xlsx\", \".csv\"]:\n for file in glob.glob(\n f\"{source_directory}/**/*{extension}\", recursive=True\n ):\n zipf.write(file, arcname=Path(file).name)\n\n return os.path.abspath(zip_path)", "test": "import unittest\nimport tempfile\nimport os\nfrom pathlib import Path\nimport zipfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_source_dir = tempfile.TemporaryDirectory()\n self.temp_target_dir = tempfile.TemporaryDirectory()\n self.test_source_dir = self.temp_source_dir.name\n self.test_target_dir = self.temp_target_dir.name\n # Setup directory and files structure for testing\n self.files_structure = {\n \"empty_dir\": [],\n \"no_matching_files\": [\"a.pdf\", \"b.gif\"],\n \"some_matching_files\": [\"c.txt\", \"d.docx\", \"e.png\"],\n \"all_matching_files\": [\"f.txt\", 
\"g.docx\", \"h.xlsx\", \"i.csv\"],\n \"nested_dir\": [\"nested/j.txt\", \"nested/k.docx\", \"nested/l.png\"],\n \"deeply_nested_dir\": [\"deep/nested/m.xlsx\", \"deep/nested/n.csv\"],\n \"mixed_extensions\": [\"o.txt\", \"p.docx\", \"q.unknown\", \"r.csv\"],\n \"subdirs_with_files\": [\n \"subdir1/s.txt\",\n \"subdir2/t.xlsx\",\n \"subdir3/u.docx\",\n \"subdir2/v.csv\",\n ],\n }\n for dir_key, files in self.files_structure.items():\n if files:\n for file_path in files:\n full_path = os.path.join(self.test_source_dir, dir_key, file_path)\n os.makedirs(os.path.dirname(full_path), exist_ok=True)\n with open(full_path, \"w\") as f:\n f.write(\"dummy content\")\n else:\n os.makedirs(os.path.join(self.test_source_dir, dir_key), exist_ok=True)\n def tearDown(self):\n self.temp_source_dir.cleanup()\n self.temp_target_dir.cleanup()\n def zip_file_count(self, zip_path):\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n return sum(\n 1 for item in zip_ref.namelist() if Path(item).suffix in extensions\n )\n def test_case_1(self):\n # Test empty directory\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"empty_dir\"),\n self.test_target_dir,\n \"empty_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 0)\n def test_case_2(self):\n # Test no matching files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"no_matching_files\"),\n self.test_target_dir,\n \"no_match_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 0)\n def test_case_3(self):\n # Test some matching files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"some_matching_files\"),\n self.test_target_dir,\n \"some_match_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 2)\n def test_case_4(self):\n # Test all matching files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"all_matching_files\"),\n self.test_target_dir,\n \"all_match_test\",\n )\n 
self.assertEqual(self.zip_file_count(zip_path), 4)\n def test_case_5(self):\n # Test nested directory\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"nested_dir\"),\n self.test_target_dir,\n \"nested_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 2)\n def test_case_6(self):\n # Test mixed extension\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"mixed_extensions\"),\n self.test_target_dir,\n \"mixed_extensions_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 3)\n def test_case_7(self):\n # Test subdirectories with files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"subdirs_with_files\"),\n self.test_target_dir,\n \"subdirs_with_files_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 4)", "apis": ["os.path.exists", "zipfile.ZipFile", "os.path", "pathlib.Path", "os.makedirs", "os.path.abspath", "os.path.join", "glob.glob"], "libs": ["glob", "zipfile", "os", "pathlib"], "doc": {"description": ["Zip files with certain extensions from a source directory and save it as a zip file", "saved to a target directory."], "note": ["The valid extensions are: ['.txt', '.docx', '.xlsx', '.csv']."], "params": ["source_directory (str): The source directory containing the files to be zipped.", "target_directory (str): The destination directory of the zip file to be created.", "If it does not exist, the function will create it.", "zip_name (str): The name of the zip file to create (without extension; '.zip' will be added automatically)."], "returns": ["str: The full path to the created zip file in the format \"/path/to/target_directory/zip_name.zip\"."], "reqs": ["os", "glob", "pathlib", "zipfile"], "raises": ["OSError: If the source_directory does not exist."], "example": [">>> path = f_806('/path/to/source_directory', '/path/to/target_directory', 'zipped_files')", ">>> type(path)", "", ">>> path", "'/path/to/target_directory/zipped_files.zip'"]}} -{"task_id": "f_350", "prompt": "import numpy as np\nfrom 
scipy.spatial import Voronoi, voronoi_plot_2d\nimport matplotlib.pyplot as plt\n\n\ndef f_350(points, seed=0):\n \"\"\"\n Calculate the Voronoi diagram for a number of points in 2D and plot it.\n Note: this function will raise errors when input is invalid, for example wrong type or shape.\n Jittering is applied prior to plotting.\n\n Parameters:\n - points (np.ndarray): A numpy ndarray of shape (n_points, 2) with the coordinates of the points.\n - seed (int): Random seed for reproducibility. Defaults to 0.\n\n Returns:\n tuple (vor, ax): A tuple containing:\n - vor (Voronoi): A Voronoi object representing the Voronoi diagram of the points.\n - ax (Axes): The axes of the plotted Voronoi diagram.\n\n Requirements:\n - numpy\n - scipy\n - matplotlib.pyplot\n\n Example:\n >>> points = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])\n >>> vor, ax = f_350(points)\n >>> type(vor)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if not isinstance(points, np.ndarray):\n raise TypeError(\"Expected Numpy array\")\n if len(points) < 3:\n raise ValueError(\"Voronoi diagram needs at least 3 points\")\n if points.shape[-1] != 2:\n raise ValueError(\"Expected array of 2D points\")\n\n np.random.seed(seed)\n\n # Add a slight random jitter to the points\n jittered_points = points + np.random.normal(0, 1e-10, points.shape)\n\n vor = Voronoi(jittered_points)\n fig, ax = plt.subplots()\n voronoi_plot_2d(vor, ax=ax)\n\n return vor, ax", "test": "import unittest\nimport numpy as np\nfrom scipy.spatial import Voronoi\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.points = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])\n def test_case_1(self):\n # Standard tests\n vor, ax = f_350(self.points)\n self._run_test(self.points, vor, ax)\n def test_case_2(self):\n # Test random seed\n vor, _ = f_350(self.points, seed=0)\n vor1, _ = f_350(self.points, seed=0)\n vor2, _ = f_350(self.points, seed=1)\n self.assertTrue((vor.ridge_points == vor1.ridge_points).all())\n 
self.assertFalse((vor1.ridge_points == vor2.ridge_points).all())\n def test_case_3(self):\n # Test with points that are extremely close to each other\n points = np.array([[0, 0], [0, 1e-12], [1, 0]])\n vor, ax = f_350(points)\n self._run_test(points, vor, ax)\n def test_case_4(self):\n # Test with fewer than three points, which is the minimum to form a Voronoi diagram.\n points = np.array([[0, 0], [1, 1]])\n with self.assertRaises(Exception):\n f_350(points)\n def test_case_5(self):\n # Test with invalid input shapes, such as one-dimensional array.\n points = np.array([1, 2, 3])\n with self.assertRaises(Exception):\n f_350(points)\n def test_case_6(self):\n # Test with invalid input types\n with self.assertRaises(Exception):\n f_350(\"Not valid points\")\n def _run_test(self, points, vor, ax):\n # Check the point_region attribute of Voronoi object\n self.assertIsInstance(vor, Voronoi)\n self.assertEqual(len(vor.point_region), len(points))\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(len(ax.get_children()) > 0, \"The plot should have elements.\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.ndarray", "scipy.spatial.voronoi_plot_2d", "numpy.random", "numpy.random.seed", "scipy.spatial.Voronoi", "numpy.random.normal", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Calculate the Voronoi diagram for a number of points in 2D and plot it."], "note": ["this function will raise errors when input is invalid, for example wrong type or shape.", "Jittering is applied prior to plotting."], "params": ["points (np.ndarray): A numpy ndarray of shape (n_points, 2) with the coordinates of the points.", "seed (int): Random seed for reproducibility. 
Defaults to 0."], "returns": ["tuple (vor, ax): A tuple containing:", "vor (Voronoi): A Voronoi object representing the Voronoi diagram of the points.", "ax (Axes): The axes of the plotted Voronoi diagram."], "reqs": ["numpy", "scipy", "matplotlib.pyplot"], "raises": [], "example": [">>> points = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])", ">>> vor, ax = f_350(points)", ">>> type(vor)", "", ">>> type(ax)", ""]}} -{"task_id": "f_414", "prompt": "import json\nimport pandas as pd\nimport numpy as np\nfrom collections import defaultdict\n\n\ndef f_414(input_file=\"data.json\"):\n \"\"\"\n Read a list of dictionaries from a JSON file, calculate the mean and median for each key\n (ignoring non-numeric or missing values), and convert the results into a Pandas DataFrame.\n\n Parameters:\n - input_file (str, optional): The input JSON file name. Defaults to 'data.json'.\n The file should contain a list of dictionaries. If a key is\n missing in a dictionary, it is treated as NaN for that record.\n Non-numeric values are ignored for the calculation of mean\n and median. 
If all values for a key are non-numeric or missing,\n the statistics for that key will be NaN.\n\n Returns:\n - df (pd.DataFrame): A DataFrame indexed and sorted by the variable names (keys) from the\n input data, containing columns 'mean' and 'median'.\n\n Requirements:\n - numpy\n - collections\n - json\n - pandas\n\n Example:\n >>> df = f_414('data_1.json')\n a mean median\n b mean median\n c mean median\n \"\"\"", "canonical_solution": " with open(input_file, \"r\") as f:\n data = json.load(f)\n\n all_keys = set().union(*(d.keys() for d in data))\n stats = defaultdict(list)\n for d in data:\n for key in all_keys:\n value = d.get(key, np.nan)\n if isinstance(value, (int, float)):\n stats[key].append(value)\n else:\n stats[key].append(np.nan)\n\n result = {\n k: {\"mean\": np.nanmean(v), \"median\": np.nanmedian(v)} for k, v in stats.items()\n }\n df = pd.DataFrame(result).transpose().sort_index()\n\n return df", "test": "import unittest\nimport numpy as np\nimport tempfile\nimport json\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.temp_dir = tempfile.TemporaryDirectory()\n cls.test_data_paths = []\n test_data = [\n [{\"a\": 2, \"b\": 3, \"c\": 4}], # Test data for test_case_1\n [{\"a\": 1}], # Test data for test_case_2\n [{\"a\": 1.5}, {\"b\": None}], # Test data for test_case_3\n [], # Test data for test_case_4\n [{\"a\": 1.5, \"c\": 4}, {\"b\": None}], # Test data for test_case_5\n ]\n for idx, data in enumerate(test_data, start=1):\n path = cls.temp_dir.name + f\"/test_data_{idx}.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n cls.test_data_paths.append(path)\n def test_case_1(self):\n # Basic test\n df = f_414(self.test_data_paths[0])\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\", \"c\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 2.0)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 2.0)\n def test_case_2(self):\n # Test with a single key\n df = f_414(self.test_data_paths[1])\n 
self.assertListEqual(df.index.tolist(), [\"a\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 1.0)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 1.0)\n def test_case_3(self):\n # Test with missing values to ensure handling of NaN\n df = f_414(self.test_data_paths[2])\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 1.5)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 1.5)\n self.assertTrue(np.isnan(df.loc[\"b\", \"mean\"]))\n self.assertTrue(np.isnan(df.loc[\"b\", \"median\"]))\n def test_case_4(self):\n # Test empty dataframe creation from an empty input file\n df = f_414(self.test_data_paths[3])\n self.assertEqual(df.shape[0], 0)\n def test_case_5(self):\n # Test handling of mixed data, including valid values and NaN\n df = f_414(self.test_data_paths[4])\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\", \"c\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 1.5)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 1.5)\n self.assertTrue(np.isnan(df.loc[\"b\", \"mean\"]))\n self.assertTrue(np.isnan(df.loc[\"b\", \"median\"]))\n self.assertAlmostEqual(df.loc[\"c\", \"mean\"], 4.0)\n self.assertAlmostEqual(df.loc[\"c\", \"median\"], 4.0)\n def test_case_6(self):\n # Test with mixed types in values\n data = [{\"a\": 5, \"b\": \"text\", \"c\": 7}, {\"a\": \"more text\", \"b\": 4, \"c\": None}]\n path = self.temp_dir.name + \"/test_data_6.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n df = f_414(path)\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\", \"c\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 5.0)\n self.assertAlmostEqual(df.loc[\"c\", \"mean\"], 7.0)\n self.assertAlmostEqual(df.loc[\"b\", \"mean\"], 4.0)\n def test_case_7(self):\n # Test a larger dataset with missing values\n data = [{\"a\": i, \"b\": i * 2 if i % 2 == 0 else None} for i in range(1, 101)]\n path = self.temp_dir.name + \"/test_data_7.json\"\n with open(path, \"w\") as f:\n 
json.dump(data, f)\n df = f_414(path)\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 50.5)\n self.assertAlmostEqual(\n df.loc[\"b\", \"mean\"], np.mean([2 * i for i in range(2, 101, 2)])\n )\n def test_case_8(self):\n # Test with all non-numeric values for a key\n data = [\n {\"a\": \"text\", \"b\": \"more text\"},\n {\"a\": \"even more text\", \"b\": \"still more text\"},\n ]\n path = self.temp_dir.name + \"/test_data_8.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n df = f_414(path)\n self.assertTrue(np.isnan(df.loc[\"a\", \"mean\"]))\n self.assertTrue(np.isnan(df.loc[\"b\", \"mean\"]))\n def test_case_9(self):\n # Test varying numbers of missing and non-numeric values\n data = [\n {\"a\": 10, \"b\": 20, \"c\": \"ignore\"},\n {\"a\": None, \"b\": 25, \"c\": 30},\n {\"a\": 5, \"b\": \"ignore\", \"c\": \"ignore\"},\n ]\n path = self.temp_dir.name + \"/test_data_9.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n df = f_414(path)\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 7.5)\n self.assertAlmostEqual(df.loc[\"b\", \"mean\"], 22.5)\n self.assertAlmostEqual(df.loc[\"c\", \"mean\"], 30.0)\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir.cleanup()", "apis": ["numpy.nanmedian", "numpy.nan", "json.load", "collections.defaultdict", "pandas.DataFrame", "numpy.nanmean"], "libs": ["numpy", "collections", "json", "pandas"], "doc": {"description": ["Read a list of dictionaries from a JSON file, calculate the mean and median for each key", "(ignoring non-numeric or missing values), and convert the results into a Pandas DataFrame."], "note": [], "params": ["input_file (str, optional): The input JSON file name. Defaults to 'data.json'.", "The file should contain a list of dictionaries. If a key is", "missing in a dictionary, it is treated as NaN for that record.", "Non-numeric values are ignored for the calculation of mean", "and median. 
If all values for a key are non-numeric or missing,", "the statistics for that key will be NaN."], "returns": ["df (pd.DataFrame): A DataFrame indexed and sorted by the variable names (keys) from the", "input data, containing columns 'mean' and 'median'."], "reqs": ["numpy", "collections", "json", "pandas"], "raises": [], "example": [">>> df = f_414('data_1.json')", "a mean median", "b mean median", "c mean median"]}} -{"task_id": "f_890", "prompt": "from datetime import datetime\nimport pandas as pd\nfrom itertools import product\n\n# Constants\nEMPLOYEES = [\"John\", \"Alice\", \"Bob\", \"Charlie\", \"Dave\"]\n\n\ndef f_890(date_str):\n \"\"\"\n Generate a Pandas DataFrame containing a series of dates for a predefined list of employees.\n\n Parameters:\n - date_str (str): A date string in the \"yyyy-mm-dd\" format to define the starting date.\n\n Returns:\n - DataFrame: A pandas DataFrame with 'Employee' and 'Date' columns, listing the next 10 days for each employee.\n\n Requirements:\n - datetime.datetime\n - pandas\n - itertools\n\n Example:\n >>> df = f_890('2023-06-15')\n >>> print(df)\n Employee Date\n 0 John 2023-06-15\n 1 John 2023-06-16\n ...\n 49 Dave 2023-06-24\n \"\"\"", "canonical_solution": " start_date = datetime.strptime(date_str, \"%Y-%m-%d\")\n dates = pd.date_range(start_date, periods=10).tolist()\n\n # Creating a DataFrame from the product of EMPLOYEES and dates\n df = pd.DataFrame(list(product(EMPLOYEES, dates)), columns=[\"Employee\", \"Date\"])\n\n return df", "test": "import unittest\nimport pandas as pd\nfrom datetime import datetime, timedelta\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n def test_return_type(self):\n \"\"\"Test if the function returns a Pandas DataFrame.\"\"\"\n df_test = f_890(\"2023-01-01\")\n self.assertIsInstance(df_test, pd.DataFrame)\n def test_correct_columns(self):\n \"\"\"Test if the DataFrame has the correct columns: 'Employee' and 'Date'.\"\"\"\n df_test = 
f_890(\"2023-01-01\")\n self.assertListEqual(df_test.columns.tolist(), [\"Employee\", \"Date\"])\n def test_date_range(self):\n \"\"\"Test if the function generates the correct date range for 10 days.\"\"\"\n start_date = \"2023-01-01\"\n df_test = f_890(start_date)\n end_date = (\n datetime.strptime(start_date, \"%Y-%m-%d\") + timedelta(days=9)\n ).date()\n self.assertTrue(all(df_test[\"Date\"] <= pd.Timestamp(end_date)))\n def test_number_of_rows(self):\n \"\"\"Test if the DataFrame has the correct number of rows (10 days * number of employees).\"\"\"\n df_test = f_890(\"2023-01-01\")\n expected_rows = 10 * len(EMPLOYEES) # 10 days for each employee\n self.assertEqual(len(df_test), expected_rows)\n def test_leap_year(self):\n \"\"\"Test if the function correctly handles the date range for a leap year.\"\"\"\n df_test = f_890(\"2024-02-28\")\n leap_year_end_date = (\n datetime.strptime(\"2024-02-28\", \"%Y-%m-%d\") + timedelta(days=9)\n ).date()\n self.assertIn(pd.Timestamp(leap_year_end_date), df_test[\"Date\"].values)", "apis": ["pandas.date_range", "pandas.DataFrame", "datetime.datetime.strptime", "itertools.product"], "libs": ["pandas", "datetime", "itertools"], "doc": {"description": ["Generate a Pandas DataFrame containing a series of dates for a predefined list of employees."], "note": [], "params": ["date_str (str): A date string in the \"yyyy-mm-dd\" format to define the starting date."], "returns": ["DataFrame: A pandas DataFrame with 'Employee' and 'Date' columns, listing the next 10 days for each employee."], "reqs": ["datetime.datetime", "pandas", "itertools"], "raises": [], "example": [">>> df = f_890('2023-06-15')", ">>> print(df)", "Employee Date", "0 John 2023-06-15", "1 John 2023-06-16", "...", "49 Dave 2023-06-24"]}} -{"task_id": "f_402", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_402(array):\n \"\"\"\n Create a Pandas DataFrame from a 2D list and plot the sum of each column.\n\n Parameters:\n array (list of list 
of int): The 2D list representing the data.\n\n Returns:\n DataFrame, Axes: A pandas DataFrame with the data and a matplotlib Axes object showing the sum of each column.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Internal Constants:\n COLUMNS: List of column names used for the DataFrame ['A', 'B', 'C', 'D', 'E']\n\n Example:\n >>> df, ax = f_402([[1,2,3,4,5], [6,7,8,9,10]])\n >>> print(df)\n A B C D E\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " # Internal Constants\n COLUMNS = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n\n df = pd.DataFrame(array, columns=COLUMNS)\n sums = df.sum()\n\n fig, ax = plt.subplots()\n sums.plot(kind=\"bar\", ax=ax)\n\n return df, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df, ax = f_402([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.assertEqual(df.values.tolist(), [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.assertEqual(df.columns.tolist(), [\"A\", \"B\", \"C\", \"D\", \"E\"])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_2(self):\n df, ax = f_402(\n [[10, 20, 30, 40, 50], [15, 25, 35, 45, 55], [5, 15, 25, 35, 45]]\n )\n self.assertEqual(\n df.values.tolist(),\n [[10, 20, 30, 40, 50], [15, 25, 35, 45, 55], [5, 15, 25, 35, 45]],\n )\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_3(self):\n # Test handling uniform data\n df, ax = f_402([[1, 1, 1, 1, 1]])\n self.assertEqual(df.values.tolist(), [[1, 1, 1, 1, 1]])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_4(self):\n # Test handling all zero\n df, ax = f_402([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])\n self.assertEqual(df.values.tolist(), [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_5(self):\n # Handle negatives\n df, ax = f_402([[-1, -2, -3, -4, -5], [1, 2, 3, 4, 5]])\n self.assertEqual(df.values.tolist(), [[-1, -2, -3, -4, -5], [1, 2, 3, 4, 5]])\n self.assertTrue(isinstance(ax, 
plt.Axes))\n def test_case_6(self):\n # Handle empty\n df, ax = f_402([])\n self.assertEqual(df.values.tolist(), [])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_7(self):\n # Handle invalid input\n with self.assertRaises(TypeError):\n f_402([[\"a\", \"b\", \"c\", \"d\", \"e\"]])\n def test_case_8(self):\n # Handle large numbers\n df, _ = f_402([[1000000, 2000000, 3000000, 4000000, 5000000]])\n self.assertTrue(\n all(\n df.sum()\n == pd.Series(\n [1000000, 2000000, 3000000, 4000000, 5000000],\n index=[\"A\", \"B\", \"C\", \"D\", \"E\"],\n )\n )\n )\n def test_case_9(self):\n # Test plot details\n _, ax = f_402([[1, 2, 3, 4, 5]])\n self.assertEqual(len(ax.patches), 5) # Checks if there are exactly 5 bars\n bar_labels = [bar.get_x() for bar in ax.patches]\n self.assertEqual(len(bar_labels), 5)\n def test_case_10(self):\n # Test column sums with plot check\n data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [2, 3, 4, 5, 6]]\n df, ax = f_402(data)\n column_sums = df.sum().tolist()\n bar_heights = [bar.get_height() for bar in ax.patches]\n self.assertEqual(column_sums, bar_heights)\n self.assertEqual(\n len(ax.patches), len(data[0])\n ) # Ensure there's a bar for each column\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Create a Pandas DataFrame from a 2D list and plot the sum of each column.", "Internal Constants:", "COLUMNS: List of column names used for the DataFrame ['A', 'B', 'C', 'D', 'E']"], "note": [], "params": ["array (list of list of int): The 2D list representing the data."], "returns": ["DataFrame, Axes: A pandas DataFrame with the data and a matplotlib Axes object showing the sum of each column."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> df, ax = f_402([[1,2,3,4,5], [6,7,8,9,10]])", ">>> print(df)", "A B C D E", "0 1 2 3 4 5", "1 6 7 8 9 10", ">>> type(ax)", ""]}} -{"task_id": "f_914", "prompt": "import 
pandas as pd\nfrom random import shuffle\n\n# Constants\nPOSSIBLE_VALUES = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\", \"G\", \"H\", \"I\", \"J\"]\n\n\ndef f_914(list_of_lists):\n \"\"\"\n Generate a list of pandas DataFrames, each created from a sublist in 'list_of_lists'.\n Each DataFrame has columns named as per the elements of the sublist, and each column\n is filled with randomly shuffled values from 'POSSIBLE_VALUES'.\n\n Parameters:\n - list_of_lists (list of list): A list where each element is a list of strings\n representing column names for a DataFrame.\n\n Returns:\n - list of pandas.DataFrame: A list where each element is a DataFrame with columns as specified\n in 'list_of_lists', and each column contains shuffled values from 'POSSIBLE_VALUES'.\n\n Requirements:\n - pandas\n - random.shuffle\n\n Note:\n - The length of each DataFrame's columns is equal to the length of 'POSSIBLE_VALUES'.\n - Each column in the DataFrame has the same shuffled order of 'POSSIBLE_VALUES'.\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> dfs = f_914([['x', 'y', 'z'], ['a', 'b', 'c']])\n >>> dfs[0].head()\n x y z\n 0 H J H\n 1 I E A\n 2 B I J\n 3 F G D\n 4 D A C\n \"\"\"", "canonical_solution": " dataframes = []\n\n for list_ in list_of_lists:\n df_dict = {col: POSSIBLE_VALUES.copy() for col in list_}\n for col in df_dict:\n shuffle(df_dict[col])\n df = pd.DataFrame(df_dict)\n dataframes.append(df)\n\n return dataframes", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_914 function.\"\"\"\n def test_dataframe_count(self):\n \"\"\"Test number of dataframes returned.\"\"\"\n random.seed(0)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\", \"c\"], [\"m\"]]\n dfs = f_914(input_data)\n self.assertEqual(len(dfs), len(input_data))\n def test_dataframe_columns(self):\n \"\"\"Test each dataframe has correct columns.\"\"\"\n random.seed(1)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\", \"c\"], [\"m\"]]\n 
dfs = f_914(input_data)\n for idx, df in enumerate(dfs):\n self.assertListEqual(list(df.columns), input_data[idx])\n def test_dataframe_values(self):\n \"\"\"Test values in each dataframe column are from the POSSIBLE_VALUES list.\"\"\"\n random.seed(2)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\", \"c\"], [\"m\"]]\n dfs = f_914(input_data)\n for df in dfs:\n for col in df.columns:\n self.assertTrue(all(val in POSSIBLE_VALUES for val in df[col].values))\n def test_empty_input(self):\n \"\"\"Test function with an empty list of lists.\"\"\"\n random.seed(3)\n dfs = f_914([])\n self.assertEqual(len(dfs), 0)\n def test_single_list_input(self):\n \"\"\"Test function with a single list input.\"\"\"\n random.seed(4)\n input_data = [[\"x\", \"y\", \"z\"]]\n dfs = f_914(input_data)\n self.assertEqual(len(dfs), 1)\n self.assertListEqual(list(dfs[0].columns), input_data[0])\n self.assertTrue(all(val in POSSIBLE_VALUES for val in dfs[0][\"x\"].values))\n self.assertTrue(all(val in POSSIBLE_VALUES for val in dfs[0][\"y\"].values))\n self.assertTrue(all(val in POSSIBLE_VALUES for val in dfs[0][\"z\"].values))", "apis": ["random.shuffle", "pandas.DataFrame"], "libs": ["pandas", "random"], "doc": {"description": ["Generate a list of pandas DataFrames, each created from a sublist in 'list_of_lists'.", "Each DataFrame has columns named as per the elements of the sublist, and each column", "is filled with randomly shuffled values from 'POSSIBLE_VALUES'."], "note": ["The length of each DataFrame's columns is equal to the length of 'POSSIBLE_VALUES'.", "Each column in the DataFrame has the same shuffled order of 'POSSIBLE_VALUES'."], "params": ["list_of_lists (list of list): A list where each element is a list of strings", "representing column names for a DataFrame."], "returns": ["list of pandas.DataFrame: A list where each element is a DataFrame with columns as specified", "in 'list_of_lists', and each column contains shuffled values from 'POSSIBLE_VALUES'."], "reqs": ["pandas", 
"random.shuffle"], "raises": [], "example": [">>> import random", ">>> random.seed(0)", ">>> dfs = f_914([['x', 'y', 'z'], ['a', 'b', 'c']])", ">>> dfs[0].head()", "x y z", "0 H J H", "1 I E A", "2 B I J", "3 F G D", "4 D A C"]}} -{"task_id": "f_773", "prompt": "from collections import defaultdict\nimport re\n\ndef f_773(word: str) -> dict:\n \"\"\"\n Find the occurrences of each two-letter combination in the sanitized word,\n where only alphabetic characters are considered.\n\n Requirements:\n - collections.defaultdict\n - re\n \n Parameters:\n word (str): The input string.\n\n Returns:\n collections.defaultdict: A dictionary with keys as two-letter combinations and values as their counts in the sanitized word.\n\n Example:\n >>> f_773('abcdef')\n defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1, 'de': 1, 'ef': 1})\n >>> f_773('aabbcc')\n defaultdict(, {'aa': 1, 'ab': 1, 'bb': 1, 'bc': 1, 'cc': 1})\n >>> f_773('a1!b@c#d$')\n defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1})\n \"\"\"", "canonical_solution": " # Sanitize the word to include only alphabetic characters\n sanitized_word = re.sub('[^A-Za-z]', '', word)\n occurrences = defaultdict(int)\n pairs = [''.join(x) for x in zip(sanitized_word, sanitized_word[1:])]\n\n for pair in pairs:\n occurrences[pair] += 1\n\n return occurrences", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_773('abcdef')\n expected = {'ab': 1, 'bc': 1, 'cd': 1, 'de': 1, 'ef': 1}\n self.assertEqual(result, expected)\n def test_case_2(self):\n result = f_773('aabbcc')\n expected = {'aa': 1, 'ab': 1, 'bb': 1, 'bc': 1, 'cc': 1}\n self.assertEqual(result, expected)\n def test_case_3(self):\n result = f_773('a')\n expected = {}\n self.assertEqual(result, expected)\n def test_case_4(self):\n result = f_773('')\n expected = {}\n self.assertEqual(result, expected)\n def test_case_5(self):\n result = f_773('AbCd')\n expected = {'Ab': 1, 'bC': 1, 'Cd': 1}\n self.assertEqual(result, expected)\n def 
test_case_6(self):\n # Test with non-alphabetic characters in the word\n result = f_773('a1!b@c#d$')\n expected = {'ab': 1, 'bc': 1, 'cd': 1}\n self.assertEqual(result, expected)\n def test_case_7(self):\n # Test with mixed case and non-alphabetic characters\n result = f_773('AaBb!!Cc123')\n expected = {'Aa': 1, 'aB': 1, 'Bb': 1, 'bC': 1, 'Cc': 1}\n self.assertEqual(result, expected)", "apis": ["collections.defaultdict", "re.sub"], "libs": ["collections", "re"], "doc": {"description": ["Find the occurrences of each two-letter combination in the sanitized word,", "where only alphabetic characters are considered."], "note": [], "params": ["word (str): The input string."], "returns": ["collections.defaultdict: A dictionary with keys as two-letter combinations and values as their counts in the sanitized word."], "reqs": ["collections.defaultdict", "re"], "raises": [], "example": [">>> f_773('abcdef')", "defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1, 'de': 1, 'ef': 1})", ">>> f_773('aabbcc')", "defaultdict(, {'aa': 1, 'ab': 1, 'bb': 1, 'bc': 1, 'cc': 1})", ">>> f_773('a1!b@c#d$')", "defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1})"]}} +{"task_id": "f_885", "prompt": "import re\nimport os\n\n\ndef f_885(request):\n \"\"\"\n Handles an HTTP GET request to retrieve a static file from the server.\n\n This function processes an HTTP GET request, extracts the filename from it, checks the existence of the file\n in the server's directory, and returns an HTTP response. The response either contains the file content (if found) or an\n appropriate error message (if not found or if the request is invalid).\n\n Parameters:\n - request (str): An HTTP GET request in string format. 
The expected format is \"GET / HTTP/1.1\".\n\n Returns:\n - str: An HTTP response string, which includes the status code, content length (for 200 OK responses), and the file content\n or an error message.\n\n Requirements:\n - os\n - re\n\n Examples:\n >>> f_885(\"GET /test.txt HTTP/1.1\")\n \"HTTP/1.1 200 OK\\r\\nContent-Length: \\r\\n\\r\\n\"\n >>> f_885(\"GET /nonexistent.txt HTTP/1.1\")\n \"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"\n >>> f_885(\"INVALID REQUEST\")\n \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n >>> f_885(\"GET /restricted.txt HTTP/1.1\") # Assuming an I/O error occurs\n \"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\"\n \"\"\"", "canonical_solution": " match = re.match(r\"^GET /([\\w\\.\\-]+) HTTP/1\\.1$\", request)\n if match:\n file_name = match.group(1)\n if os.path.exists(file_name):\n try:\n with open(file_name, \"rb\") as file:\n content = file.read()\n response = f\"HTTP/1.1 200 OK\\r\\nContent-Length: {len(content)}\\r\\n\\r\\n{content.decode('utf-8')}\"\n except Exception:\n response = (\n \"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\"\n )\n else:\n response = \"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"\n else:\n response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n\n return response", "test": "import unittest\nimport re\nimport os\nfrom unittest.mock import mock_open, patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_885 function.\"\"\"\n def setUp(self):\n \"\"\"Set up the environment for testing by creating test files.\"\"\"\n with open(\"test.txt\", \"w\", encoding=\"utf-8\") as f:\n f.write(\"This is a test file.\")\n def tearDown(self):\n \"\"\"Clean up the environment by deleting the test files created.\"\"\"\n os.remove(\"test.txt\")\n def test_file_found(self):\n \"\"\"Test the response when the requested file is found.\"\"\"\n request = \"GET /test.txt HTTP/1.1\"\n expected_response = (\n \"HTTP/1.1 200 
OK\\r\\nContent-Length: 20\\r\\n\\r\\nThis is a test file.\"\n )\n self.assertEqual(f_885(request), expected_response)\n def test_file_not_found(self):\n \"\"\"Test the response when the requested file is not found.\"\"\"\n request = \"GET /nonexistent.txt HTTP/1.1\"\n expected_response = \"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"\n self.assertEqual(f_885(request), expected_response)\n def test_bad_request(self):\n \"\"\"Test the response for a badly formatted request.\"\"\"\n request = \"BAD REQUEST\"\n expected_response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n self.assertEqual(f_885(request), expected_response)\n def test_empty_request(self):\n \"\"\"Test the response for an empty request.\"\"\"\n request = \"\"\n expected_response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n self.assertEqual(f_885(request), expected_response)\n def test_invalid_method_request(self):\n \"\"\"Test the response for a request with an invalid HTTP method.\"\"\"\n request = \"POST /test.txt HTTP/1.1\"\n expected_response = \"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"\n self.assertEqual(f_885(request), expected_response)\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"data\")\n def test_internal_server_error(self, mock_file):\n \"\"\"Test the response when there's an internal server error (e.g., file read error).\"\"\"\n mock_file.side_effect = Exception(\"Mocked exception\")\n request = \"GET /test.txt HTTP/1.1\"\n expected_response = (\n \"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\"\n )\n self.assertEqual(f_885(request), expected_response)", "apis": ["re.match", "os.path.exists", "os.path"], "libs": ["re", "os"], "doc": {"description": ["Handles an HTTP GET request to retrieve a static file from the server.", "This function processes an HTTP GET request, extracts the filename from it, checks the existence of the file", "in the server's directory, and returns an HTTP response. 
The response either contains the file content (if found) or an", "appropriate error message (if not found or if the request is invalid)."], "note": [], "params": ["request (str): An HTTP GET request in string format. The expected format is \"GET / HTTP/1.1\"."], "returns": ["str: An HTTP response string, which includes the status code, content length (for 200 OK responses), and the file content", "or an error message."], "reqs": ["os", "re"], "raises": [], "example": ["Examples:", ">>> f_885(\"GET /test.txt HTTP/1.1\")", "\"HTTP/1.1 200 OK\\r\\nContent-Length: \\r\\n\\r\\n\"", ">>> f_885(\"GET /nonexistent.txt HTTP/1.1\")", "\"HTTP/1.1 404 NOT FOUND\\r\\n\\r\\nFile Not Found\"", ">>> f_885(\"INVALID REQUEST\")", "\"HTTP/1.1 400 BAD REQUEST\\r\\n\\r\\nBad Request\"", ">>> f_885(\"GET /restricted.txt HTTP/1.1\") # Assuming an I/O error occurs", "\"HTTP/1.1 500 INTERNAL SERVER ERROR\\r\\n\\r\\nInternal Server Error\""]}} +{"task_id": "f_388", "prompt": "import random\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\n\ndef f_388(epoch_milliseconds, seed=None):\n \"\"\"\n Generate and draw a sales trend for different categories from a particular epoch milliseconds\n to the current time.\n\n The function selects category from ['Electronics', 'Clothing', 'Home', 'Books', 'Sports'].\n Each day's sales are randomly determined between 10 and 50 units for each category.\n The plot's x-axis represents 'Days since (the start date)', and the y-axis represents 'Sales' units.\n\n Parameters:\n - epoch_milliseconds (int): Start time. Must be positive and before current time.\n - seed (int, optional): Seed for random number generation. 
Default is None (no seed).\n\n Returns:\n - sales_data (dict): Sales data for different categories over days.\n - ax (plt.Axes): The plot depicting the sales trend.\n\n Requirements:\n - random\n - datetime.datetime\n - matplotlib\n\n Example:\n >>> random.seed(42)\n >>> sales_data, ax = f_388(1236472051807, seed=42)\n >>> type(sales_data)\n \n >>> list(sales_data['Electronics'])[:3]\n [50, 24, 47]\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " CATEGORIES = [\"Electronics\", \"Clothing\", \"Home\", \"Books\", \"Sports\"]\n\n if seed is not None:\n random.seed(seed)\n\n if epoch_milliseconds < 0:\n raise ValueError(\"Start time cannot be negative.\")\n\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n current_time = datetime.now()\n days_diff = (current_time - start_time).days\n if days_diff <= 0:\n raise ValueError(\"Start date must be before current time.\")\n\n sales_data = {category: [0] * days_diff for category in CATEGORIES}\n\n for i in range(days_diff):\n for category in CATEGORIES:\n sales = random.randint(10, 50)\n sales_data[category][i] += sales\n\n fig, ax = plt.subplots()\n for category, sales in sales_data.items():\n ax.plot(range(days_diff), sales, label=category)\n\n ax.set_xlabel(\"Days since \" + start_time.strftime(\"%Y-%m-%d %H:%M:%S\"))\n ax.set_ylabel(\"Sales\")\n ax.legend()\n\n return sales_data, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nfrom datetime import datetime\nfrom datetime import timedelta\nclass TestCases(unittest.TestCase):\n def _check_sales_data(self, sales_data, expected_days):\n \"\"\"Utility function to validate sales data.\"\"\"\n self.assertIsInstance(sales_data, dict)\n self.assertEqual(\n set(sales_data.keys()),\n set([\"Electronics\", \"Clothing\", \"Home\", \"Books\", \"Sports\"]),\n )\n for category, sales in sales_data.items():\n self.assertEqual(len(sales), expected_days)\n for sale in sales:\n self.assertGreaterEqual(sale, 10)\n self.assertLessEqual(sale, 50)\n def 
test_case_1(self):\n # Basic test on manual example - Jan 1 2021\n sales_data, ax = f_388(1609459200000, seed=1)\n self.assertIsInstance(sales_data, dict)\n self.assertIsInstance(ax, plt.Axes)\n self._check_sales_data(\n sales_data,\n (datetime.now() - datetime.fromtimestamp(1609459200000 / 1000.0)).days,\n )\n self.assertEqual(ax.get_ylabel(), \"Sales\")\n def test_case_2(self):\n # Basic test on current date - should raise error\n current_epoch = int(datetime.now().timestamp() * 1000)\n with self.assertRaises(ValueError):\n f_388(current_epoch, seed=2)\n def test_case_3(self):\n # Test random seed\n t = 1609459200000\n sales_data1, _ = f_388(t, seed=42)\n sales_data2, _ = f_388(t, seed=42)\n sales_data3, _ = f_388(t, seed=3)\n self.assertEqual(sales_data1, sales_data2)\n self.assertNotEqual(sales_data1, sales_data3)\n def test_case_4(self):\n # Test that future date raises ValueError\n future_epoch = int((datetime.now() + timedelta(days=1)).timestamp() * 1000)\n with self.assertRaises(ValueError):\n f_388(future_epoch, seed=4)\n def test_case_5(self):\n # Test that negative epoch milliseconds raise an error\n with self.assertRaises(ValueError):\n f_388(-1609459200000, seed=5)\n def test_case_6(self):\n # Test that non-integer types for epoch milliseconds raise a TypeError\n with self.assertRaises(TypeError):\n f_388(\"1609459200000\", seed=6)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["datetime.datetime.fromtimestamp", "datetime.datetime.now", "random.randint", "random.seed", "matplotlib.pyplot.subplots"], "libs": ["random", "matplotlib", "datetime"], "doc": {"description": ["Generate and draw a sales trend for different categories from a particular epoch milliseconds", "to the current time.", "The function selects category from ['Electronics', 'Clothing', 'Home', 'Books', 'Sports'].", "Each day's sales are randomly determined between 10 and 50 units for each category.", "The plot's x-axis represents 'Days since (the start date)', and the y-axis 
represents 'Sales' units."], "note": [], "params": ["epoch_milliseconds (int): Start time. Must be positive and before current time.", "seed (int, optional): Seed for random number generation. Default is None (no seed)."], "returns": ["sales_data (dict): Sales data for different categories over days.", "ax (plt.Axes): The plot depicting the sales trend."], "reqs": ["random", "datetime.datetime", "matplotlib"], "raises": [], "example": [">>> random.seed(42)", ">>> sales_data, ax = f_388(1236472051807, seed=42)", ">>> type(sales_data)", "", ">>> list(sales_data['Electronics'])[:3]", "[50, 24, 47]", ">>> type(ax)", ""]}} +{"task_id": "f_535", "prompt": "import pandas as pd\nimport os\n\ndef f_535(filename):\n \"\"\"\n Read a CSV file of pandas, reverse the order of the lines and write the inverted lines back into the file. Then move the cursor back to the beginning of the file. \n The header should not be inverted and the file may be empty.\n\n Parameters:\n - filename (str): The name of the CSV file.\n\n Returns:\n - filename (str): The name of the CSV file.\n\n Requirements:\n - os\n - pandas\n\n Example:\n >>> f_535('file.csv')\n 'file.csv'\n \"\"\"", "canonical_solution": " if not os.path.exists(filename):\n return filename\n\n # Check if empty\n with open(filename, 'r') as file:\n if not file.read(1):\n return filename\n\n df = pd.read_csv(filename)\n df = df.iloc[::-1]\n df.to_csv(filename, index=False)\n\n with open(filename, 'r+') as file:\n file.seek(0)\n\n return filename", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def base(self, filename, contents, expected):\n # Create file\n with open(filename, 'w') as f:\n f.write(contents)\n # Run function\n f_535(filename)\n # Check file\n with open(filename, 'r') as f:\n self.assertEqual(f.read().strip(), expected.strip())\n # Remove file\n os.remove(filename)\n def test_case_1(self):\n self.base('file.csv', 'a,b,c\\n1,2,3\\n4,5,6\\n7,8,9', 'a,b,c\\n7,8,9\\n4,5,6\\n1,2,3')\n def 
test_case_2(self):\n self.base('file.csv', 'a,b,c\\n1,2,3\\n4,5,6', 'a,b,c\\n4,5,6\\n1,2,3')\n def test_case_3(self):\n self.base('file.csv', 'a,b,c\\n1,2,3', 'a,b,c\\n1,2,3')\n def test_case_4(self):\n self.base('file.csv', 'a,b,c', 'a,b,c')\n def test_case_5(self):\n self.base('file.csv', '', '')", "apis": ["pandas.read_csv", "os.path", "os.path.exists"], "libs": ["os", "pandas"], "doc": {"description": ["Read a CSV file of pandas, reverse the order of the lines and write the inverted lines back into the file. Then move the cursor back to the beginning of the file.", "The header should not be inverted and the file may be empty."], "note": [], "params": ["filename (str): The name of the CSV file."], "returns": ["filename (str): The name of the CSV file."], "reqs": ["os", "pandas"], "raises": [], "example": [">>> f_535('file.csv')", "'file.csv'"]}} +{"task_id": "f_609", "prompt": "import base64\nimport re\nfrom html import unescape\nimport textwrap\n\ndef f_609(raw_string, line_length):\n \"\"\"\n Decode a raw string from base64, decouple HTML entities, replace multiple spaces with a single space, strip leading and subsequent spaces, and wrap text to a certain line length.\n\n Parameters:\n - raw_string (str): The base64 encoded string.\n - line_length (int): The maximum length of a line.\n\n Returns:\n - wrapped_text (str): The cleaned and formatted string.\n\n Requirements:\n - base64\n - re\n - html\n - textwrap\n\n Example:\n >>> f_609('SGVsbG8sICBXb3JsZCEgICAg', 5)\n 'Hello\\\\n, Wor\\\\nld!'\n \"\"\"", "canonical_solution": "\n # Decode the string from base64\n decoded_string = base64.b64decode(raw_string).decode('utf-8')\n\n # Unescape HTML entities\n unescaped_string = unescape(decoded_string)\n\n # Replace multiple spaces with a single space and strip leading and trailing spaces\n cleaned_string = re.sub(' +', ' ', unescaped_string).strip()\n\n # Wrap the text\n wrapped_text = textwrap.fill(cleaned_string, line_length)\n\n return wrapped_text", "test": 
"import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 5), 'Hello\\n, Wor\\nld!')\n def test_case_2(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 10), 'Hello,\\nWorld!')\n def test_case_3(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 20), 'Hello, World!')\n def test_case_4(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 1), 'H\\ne\\nl\\nl\\no\\n,\\nW\\no\\nr\\nl\\nd\\n!')\n def test_case_5(self):\n self.assertEqual(f_609('SGVsbG8sICBXb3JsZCEgICAg', 2), 'He\\nll\\no,\\nWo\\nrl\\nd!')", "apis": ["re.sub", "base64.b64decode", "textwrap.fill", "html.unescape"], "libs": ["base64", "re", "textwrap", "html"], "doc": {"description": ["Decode a raw string from base64, decouple HTML entities, replace multiple spaces with a single space, strip leading and subsequent spaces, and wrap text to a certain line length."], "note": [], "params": ["raw_string (str): The base64 encoded string.", "line_length (int): The maximum length of a line."], "returns": ["wrapped_text (str): The cleaned and formatted string."], "reqs": ["base64", "re", "html", "textwrap"], "raises": [], "example": [">>> f_609('SGVsbG8sICBXb3JsZCEgICAg', 5)", "'Hello\\\\n, Wor\\\\nld!'"]}} +{"task_id": "f_820", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndef f_820(array, features=None, seed=None):\n \"\"\"\n Shuffles the columns of a given 2D numpy array and visualizes it as a heatmap.\n\n Parameters:\n - array (ndarray): The 2D numpy array to shuffle and plot. 
It must not be empty.\n - features (list of str, optional): Custom labels for the columns after shuffling.\n If not specified, default numerical labels are used.\n The list must match the number of columns in 'array'.\n - seed (int, optional): Seed for the random number generator to ensure reproducibility of the shuffle.\n\n Returns:\n - Axes: The matplotlib Axes object containing the heatmap.\n\n Raises:\n - ValueError: If 'features' is provided and does not match the number of columns in 'array'; and\n if 'array' is empty or not 2-dimensional.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - seaborn\n\n Notes:\n - This function uses the features list as labels for the heatmap's x-axis if features is provided;\n otherwise, it defaults to strings of the numerical labels starting from 1 up to the number of\n columns in the array.\n\n Example:\n >>> np.random.seed(0)\n >>> array = np.random.rand(2, 5)\n >>> ax = f_820(array, features=['A', 'B', 'C', 'D', 'E'], seed=1)\n >>> type(ax)\n \n >>> ax.collections[0].get_array().data.flatten()\n array([0.60276338, 0.71518937, 0.4236548 , 0.5488135 , 0.54488318,\n 0.891773 , 0.43758721, 0.38344152, 0.64589411, 0.96366276])\n \"\"\"", "canonical_solution": "\n if seed is not None:\n np.random.seed(seed)\n\n if array.size == 0 or len(array.shape) != 2:\n raise ValueError(\"Input array must be 2-dimensional and non-empty.\")\n\n if features is not None and len(features) != array.shape[1]:\n raise ValueError(\"Features list must match the number of columns in the array.\")\n\n shuffled_array = np.random.permutation(array.T).T\n\n fig, ax = plt.subplots()\n sns.heatmap(\n shuffled_array,\n xticklabels=features if features is not None else np.arange(array.shape[1]) + 1,\n ax=ax,\n )\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n self.array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n 
self.expected_labels = [\"1\", \"2\", \"3\", \"4\", \"5\"]\n def test_default_features(self):\n \"\"\"Test heatmap with default features.\"\"\"\n ax = f_820(self.array)\n xticklabels = [tick.get_text() for tick in ax.get_xticklabels()]\n self.assertEqual(xticklabels, self.expected_labels)\n self.assertTrue(len(ax.collections), 1)\n def test_custom_features(self):\n \"\"\"Test heatmap with custom features.\"\"\"\n custom_labels = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n ax = f_820(self.array, features=custom_labels)\n xticklabels = [tick.get_text() for tick in ax.get_xticklabels()]\n self.assertEqual(xticklabels, custom_labels)\n self.assertTrue(len(ax.collections), 1)\n def test_features_mismatch(self):\n \"\"\"Test for error when features list does not match array dimensions.\"\"\"\n with self.assertRaises(ValueError):\n f_820(self.array, features=[\"A\", \"B\"])\n def test_seed_reproducibility(self):\n \"\"\"Test if seeding makes shuffling reproducible.\"\"\"\n ax1 = f_820(self.array, seed=42)\n ax2 = f_820(self.array, seed=42)\n heatmap_data1 = ax1.collections[0].get_array().data\n heatmap_data2 = ax2.collections[0].get_array().data\n np.testing.assert_array_equal(heatmap_data1, heatmap_data2)\n def test_empty_array(self):\n \"\"\"Test for handling an empty array.\"\"\"\n with self.assertRaises(ValueError):\n f_820(np.array([]))\n def tearDown(self):\n \"\"\"Cleanup plot figures after each test.\"\"\"\n plt.close(\"all\")", "apis": ["numpy.random.permutation", "seaborn.heatmap", "numpy.arange", "numpy.random", "matplotlib.pyplot.subplots", "numpy.random.seed"], "libs": ["seaborn", "numpy", "matplotlib"], "doc": {"description": ["Shuffles the columns of a given 2D numpy array and visualizes it as a heatmap.", "Notes:", "- This function uses the features list as labels for the heatmap's x-axis if features is provided;", "otherwise, it defaults to strings of the numerical labels starting from 1 up to the number of", "columns in the array."], "note": [], "params": 
["array (ndarray): The 2D numpy array to shuffle and plot. It must not be empty.", "features (list of str, optional): Custom labels for the columns after shuffling.", "If not specified, default numerical labels are used.", "The list must match the number of columns in 'array'.", "seed (int, optional): Seed for the random number generator to ensure reproducibility of the shuffle."], "returns": ["Axes: The matplotlib Axes object containing the heatmap."], "reqs": ["numpy", "matplotlib.pyplot", "seaborn"], "raises": ["ValueError: If 'features' is provided and does not match the number of columns in 'array'; and", "if 'array' is empty or not 2-dimensional."], "example": [">>> np.random.seed(0)", ">>> array = np.random.rand(2, 5)", ">>> ax = f_820(array, features=['A', 'B', 'C', 'D', 'E'], seed=1)", ">>> type(ax)", "", ">>> ax.collections[0].get_array().data.flatten()", "array([0.60276338, 0.71518937, 0.4236548 , 0.5488135 , 0.54488318,", "0.891773 , 0.43758721, 0.38344152, 0.64589411, 0.96366276])"]}} +{"task_id": "f_813", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_813(data: np.ndarray) -> plt.Axes:\n \"\"\"\n Plots the cumulative probability distribution of a given NumPy array of numbers,\n representing how the cumulative probability increases with the sorted data indexes.\n\n Parameters:\n - data (numpy.ndarray): The input NumPy array of non-negative numbers.\n\n Returns:\n - matplotlib.pyplot.Axes: The plot of cumulative probabilities.\n\n Requirements:\n - numpy\n - matplotlib\n\n Raises:\n - ValueError: If the input array contains negative numbers or NaNs.\n - TypeError: If the input array contains non-numeric inputs.\n\n Note:\n - In case of an all-zeros input, the cumulative probability remains at 0 across all indexes.\n - The plot uses marker ('o') and a solid line ('-') for the cumulative probability curve.\n - The plot is titled \"Cumulative Probability Plot\", with \"Index\" on the x-axis and\n \"Cumulative Probability\" on the 
y-axis.\n\n Example:\n >>> ax = f_813(np.array([1, 2, 3, 4, 5]))\n >>> ax.get_title()\n 'Cumulative Probability Plot'\n \"\"\"", "canonical_solution": " if np.any(data < 0) or np.isnan(data).any():\n raise ValueError(\"Input array contains negative numbers or NaNs.\")\n\n if not np.issubdtype(data.dtype, np.number):\n raise TypeError(\"Input array contains non-numeric values.\")\n\n data_sorted = np.sort(data)\n cumulative_prob = (\n np.cumsum(data_sorted) / np.sum(data_sorted)\n if np.sum(data_sorted) != 0\n else np.zeros_like(data_sorted)\n )\n fig, ax = plt.subplots()\n ax.plot(cumulative_prob, marker=\"o\", linestyle=\"-\")\n ax.set_xlabel(\"Index\")\n ax.set_ylabel(\"Cumulative Probability\")\n ax.set_title(\"Cumulative Probability Plot\")\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.lines import Line2D\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def helper_assert_plot_attributes(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertIn(\"Cumulative Probability Plot\", ax.get_title())\n self.assertIn(\"Index\", ax.get_xlabel())\n self.assertIn(\"Cumulative Probability\", ax.get_ylabel())\n lines = ax.get_lines()\n self.assertIsInstance(\n lines[0], Line2D, \"The plot should contain a Line2D object.\"\n )\n self.assertEqual(lines[0].get_marker(), \"o\", \"The marker should be 'o'.\")\n self.assertEqual(lines[0].get_linestyle(), \"-\", \"The linestyle should be '-'.\")\n def helper_assert_cumulative_probability_correctness(\n self, ax, expected_cumulative_prob\n ):\n line = ax.get_lines()[0]\n np.testing.assert_array_almost_equal(\n line.get_ydata(),\n expected_cumulative_prob,\n decimal=2,\n err_msg=\"Cumulative probability calculation is incorrect.\",\n )\n def test_negative_numbers(self):\n data = np.array([-1, 0, 1, 2, 3])\n with self.assertRaises(ValueError):\n f_813(data)\n def test_nan_values(self):\n data = np.array([1, 2, 3, np.nan, 5])\n with 
self.assertRaises(ValueError):\n f_813(data)\n def test_non_numeric_values(self):\n data = np.array([1, 2, 3, \"hello\", 5])\n with self.assertRaises(TypeError):\n f_813(data)\n def test_increasing_array(self):\n data = np.array([1, 2, 3])\n ax = f_813(data)\n expected_cumulative_prob = np.array([1 / 6, 1 / 2, 1])\n self.helper_assert_plot_attributes(ax=ax)\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )\n def test_constant_array(self):\n data = np.array([1, 1, 1, 1, 1])\n ax = f_813(data)\n self.helper_assert_plot_attributes(ax)\n expected_cumulative_prob = np.array([0.2, 0.4, 0.6, 0.8, 1.0])\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )\n def test_zeros_array(self):\n data = np.array([0, 0, 0, 0, 0])\n ax = f_813(data)\n self.helper_assert_plot_attributes(ax)\n expected_cumulative_prob = np.array([0, 0, 0, 0, 0])\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )\n def test_single_element_array(self):\n data = np.array([7])\n ax = f_813(data)\n self.helper_assert_plot_attributes(ax)\n expected_cumulative_prob = np.array([1])\n self.helper_assert_cumulative_probability_correctness(\n ax=ax, expected_cumulative_prob=expected_cumulative_prob\n )", "apis": ["numpy.sum", "numpy.cumsum", "numpy.any", "numpy.zeros_like", "numpy.isnan", "numpy.sort", "numpy.number", "numpy.ndarray", "matplotlib.pyplot.subplots", "matplotlib.pyplot.Axes", "numpy.issubdtype"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Plots the cumulative probability distribution of a given NumPy array of numbers,", "representing how the cumulative probability increases with the sorted data indexes."], "note": ["In case of an all-zeros input, the cumulative probability remains at 0 across all indexes.", "The plot uses marker ('o') and a solid line ('-') for the cumulative 
probability curve.", "The plot is titled \"Cumulative Probability Plot\", with \"Index\" on the x-axis and", "\"Cumulative Probability\" on the y-axis."], "params": ["data (numpy.ndarray): The input NumPy array of non-negative numbers."], "returns": ["matplotlib.pyplot.Axes: The plot of cumulative probabilities."], "reqs": ["numpy", "matplotlib"], "raises": ["ValueError: If the input array contains negative numbers or NaNs.", "TypeError: If the input array contains non-numeric inputs."], "example": [">>> ax = f_813(np.array([1, 2, 3, 4, 5]))", ">>> ax.get_title()", "'Cumulative Probability Plot'"]}} +{"task_id": "f_836", "prompt": "import re\nimport pandas as pd\nfrom scipy.stats import gaussian_kde\nfrom scipy import linalg\nimport matplotlib.pyplot as plt\n\n\ndef f_836(text):\n \"\"\"\n This code takes a text input, calculates the lengths of the words, \n and visualizes the distribution of word lengths using a histogram and a KDE curve (if applicable) on a matplotlib subplot.\n\n Parameters:\n text (str): The text string to be analyzed. The function can handle strings with various types \n of characters and punctuation.\n\n Returns:\n matplotlib.axes._subplots.Axes: An Axes object showing the histogram and optionally the KDE \n plot of word lengths. This visual representation helps in \n understanding the distribution of word lengths in the given text.\n\n Requirements:\n - re\n - matplotlib\n - scipy\n - matplotlib\n\n Example:\n >>> ax = f_836('Hello world! 
This is a test.')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " words = re.split(r\"\\W+\", text)\n word_counts = [len(word) for word in words if word]\n\n _, ax = plt.subplots()\n\n if word_counts: # Check if word_counts is not empty\n ax.hist(word_counts, bins=30, edgecolor='black', alpha=0.7)\n\n # Add KDE plot if applicable\n if len(word_counts) > 1 and np.var(word_counts) != 0:\n try:\n kde = gaussian_kde(word_counts)\n x_range = np.linspace(min(word_counts), max(word_counts), 100)\n ax.plot(x_range, kde(x_range), color='red') # KDE line in red\n except linalg.LinAlgError:\n # Handle the singular matrix error\n pass\n\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the f_836 function\"\"\"\n def test_simple_sentence(self):\n \"\"\"Test a simple sentence\"\"\"\n ax1 = f_836(\"This is a test\")\n self.assertIsInstance(ax1, plt.Axes)\n # The number of bars might differ due to matplotlib's binning strategy\n unique_word_lengths = {len(word) for word in \"This is a test\".split() if word}\n self.assertTrue(\n len(ax1.patches) >= len(unique_word_lengths),\n \"Incorrect number of bars for a simple sentence\",\n )\n def test_empty_string(self):\n \"\"\"Test an empty string\"\"\"\n ax2 = f_836(\"\")\n self.assertIsInstance(ax2, plt.Axes)\n self.assertEqual(\n len(ax2.patches), 0, \"There should be no bars for an empty string\"\n )\n def test_special_characters(self):\n \"\"\"Test special characters and numbers\"\"\"\n ax3 = f_836(\"Hello, world! 1234\")\n self.assertIsInstance(ax3, plt.Axes)\n # The number of bars might differ due to matplotlib's binning strategy\n unique_word_lengths = {\n len(word) for word in \"Hello, world! 
1234\".split() if word\n }\n self.assertTrue(\n len(ax3.patches) >= len(unique_word_lengths),\n \"Incorrect handling of special characters and numbers\",\n )\n def test_repeated_words(self):\n \"\"\"Test repeated words\"\"\"\n ax4 = f_836(\"repeat repeat repeat\")\n self.assertIsInstance(ax4, plt.Axes)\n # Only one unique word length: 6\n self.assertTrue(len(ax4.patches) >= 1, \"Incorrect handling of repeated words\")\n def test_long_text(self):\n \"\"\"Test a long text\"\"\"\n text = \"A long text with multiple words of different lengths\"\n ax5 = f_836(text)\n self.assertIsInstance(ax5, plt.Axes)\n # Adjust expectation for number of bars due to matplotlib's binning\n words = re.split(r\"\\W+\", text)\n word_counts = pd.Series([len(word) for word in words if word])\n expected_unique_lengths = len(set(word_counts))\n self.assertTrue(\n len(ax5.patches) >= expected_unique_lengths,\n \"Incorrect plot for a long text\",\n )\n def tearDown(self):\n plt.clf()", "apis": ["matplotlib.pyplot.subplots", "scipy.stats.gaussian_kde", "re.split", "scipy.linalg.LinAlgError"], "libs": ["re", "matplotlib", "scipy"], "doc": {"description": ["This code takes a text input, calculates the lengths of the words,", "and visualizes the distribution of word lengths using a histogram and a KDE curve (if applicable) on a matplotlib subplot."], "note": [], "params": ["text (str): The text string to be analyzed. The function can handle strings with various types", "of characters and punctuation."], "returns": ["matplotlib.axes._subplots.Axes: An Axes object showing the histogram and optionally the KDE", "plot of word lengths. This visual representation helps in", "understanding the distribution of word lengths in the given text."], "reqs": ["re", "matplotlib", "scipy", "matplotlib"], "raises": [], "example": [">>> ax = f_836('Hello world! 
This is a test.')", ">>> type(ax)", ""]}} +{"task_id": "f_842", "prompt": "import urllib.request\nimport os\nimport csv\nimport collections\n\n\ndef f_842(url, column_name, csv_file_path):\n \"\"\"\n Download a CSV file from a given URL, save it to a specified path, and count\n the occurrences of each value in a particular column. The function handles various\n scenarios including missing columns and file download errors.\n\n Parameters:\n url (str): The URL of the CSV file to be downloaded. Must be a valid and accessible URL.\n column_name (str): The name of the column in the CSV file whose values are to be counted.\n The function will raise a ValueError if this column is not found.\n csv_file_path (str): The file path where the downloaded CSV file will be saved.\n If a file already exists at this path, it will be overwritten.\n\n Returns:\n dict: A dictionary mapping the values from the specified column to their\n corresponding occurrence counts.\n\n Raises:\n ValueError: If the specified column_name does not exist in the CSV file, the function\n will delete the downloaded file and raise a ValueError with a message\n stating \"The provided column_name '{column_name}' does not exist in the CSV file.\"\n\n Requirements:\n - urllib\n - os\n - csv\n - collections\n\n Example:\n >>> f_842('http://example.com/data.csv', 'category', 'downloaded_data.csv')\n {'cat1': 5, 'cat2': 3, 'cat3': 8}\n # This is a hypothetical output; the actual output will depend on the CSV data.\n\n Notes:\n - The downloaded CSV file is deleted after its contents have been processed.\n - The function only counts values in the specified column and ignores other data.\n \"\"\"", "canonical_solution": " urllib.request.urlretrieve(url, csv_file_path)\n\n with open(csv_file_path, \"r\", encoding=\"utf-8\") as f:\n reader = csv.DictReader(f)\n if column_name not in reader.fieldnames:\n os.remove(csv_file_path)\n raise ValueError(\n f\"The provided column_name '{column_name}' does not exist in the CSV 
file.\"\n )\n values = [row[column_name] for row in reader]\n\n os.remove(csv_file_path)\n\n return collections.Counter(values)", "test": "import unittest\nfrom unittest.mock import patch, mock_open\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_842 function.\"\"\"\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"category,other\\n\" + \"cat1,x\\n\" * 2 + \"cat2,y\\n\" * 2 + \"cat3,z\\n\",\n )\n def test_count_categories_data1(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each category in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"category\", \"/mock/path/data1.csv\")\n self.assertEqual(result, {\"cat1\": 2, \"cat2\": 2, \"cat3\": 1})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"name,other\\n\" + \"Alice,x\\n\" * 2 + \"Bob,y\\n\" + \"Charlie,z\\n\",\n )\n def test_count_names_data2(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each name in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"name\", \"/mock/path/data2.csv\")\n self.assertEqual(result, {\"Alice\": 2, \"Bob\": 1, \"Charlie\": 1})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"category,other\\n\" + \"cat1,x\\n\" * 2 + \"cat2,y\\n\" + \"cat3,z\\n\" * 2,\n )\n def test_count_categories_data3(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each category in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"category\", \"/mock/path/data3.csv\")\n self.assertEqual(result, {\"cat1\": 2, \"cat2\": 1, \"cat3\": 2})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n 
new_callable=mock_open,\n read_data=\"name,other\\n\" + \"Alice,x\\n\" * 3 + \"Bob,y\\n\" + \"Charlie,z\\n\",\n )\n def test_count_names_data3(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function counts the occurrences of each name in the CSV file.\"\"\"\n result = f_842(\"mock_url\", \"name\", \"/mock/path/data3.csv\")\n self.assertEqual(result, {\"Alice\": 3, \"Bob\": 1, \"Charlie\": 1})\n @patch(\"os.remove\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=\"name,other\\n\" + \"Alice,x\\n\" * 3 + \"Bob,y\\n\" + \"Charlie,z\\n\",\n )\n def test_non_existent_column(self, mock_file, mock_urlretrieve, mock_remove):\n \"\"\"Test that the function raises an exception when the specified column does not exist.\"\"\"\n with self.assertRaises(ValueError):\n f_842(\"mock_url\", \"non_existent_column\", \"/mock/path/data3.csv\")", "apis": ["urllib.request", "csv.DictReader", "collections.Counter", "os.remove", "urllib.request.urlretrieve"], "libs": ["os", "collections", "urllib", "csv"], "doc": {"description": ["Download a CSV file from a given URL, save it to a specified path, and count", "the occurrences of each value in a particular column. The function handles various", "scenarios including missing columns and file download errors.", "Notes:", "- The downloaded CSV file is deleted after its contents have been processed.", "- The function only counts values in the specified column and ignores other data."], "note": [], "params": ["url (str): The URL of the CSV file to be downloaded. 
Must be a valid and accessible URL.", "column_name (str): The name of the column in the CSV file whose values are to be counted.", "The function will raise a ValueError if this column is not found.", "csv_file_path (str): The file path where the downloaded CSV file will be saved.", "If a file already exists at this path, it will be overwritten."], "returns": ["dict: A dictionary mapping the values from the specified column to their", "corresponding occurrence counts."], "reqs": ["urllib", "os", "csv", "collections"], "raises": ["ValueError: If the specified column_name does not exist in the CSV file, the function", "will delete the downloaded file and raise a ValueError with a message", "stating \"The provided column_name '{column_name}' does not exist in the CSV file.\""], "example": [">>> f_842('http://example.com/data.csv', 'category', 'downloaded_data.csv')", "{'cat1': 5, 'cat2': 3, 'cat3': 8}", "# This is a hypothetical output; the actual output will depend on the CSV data."]}} +{"task_id": "f_397", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom datetime import datetime\n\n\ndef f_397(column, data):\n \"\"\"\n Analyze and visualize statistical properties of a specified weather data column.\n\n This function calculates the sum, mean, minimum, and maximum values of a specified column in the given data.\n It also generates a histogram plot of the data in the column. The dataset is expected to be a list of weather\n observations, where each observation includes date, temperature, humidity, wind speed, and precipitation values.\n If the provided data list is empty, resulting in an empty DataFrame, the function handles it by setting:\n - The 'mean' value to np.nan.\n - The 'min' value to np.inf.\n - The 'max' value to -np.inf.\n\n Parameters:\n column (str): The column to analyze. 
Valid columns include 'Temperature', 'Humidity', 'Wind Speed', and 'Precipitation'.\n data (list of lists): The weather data where each inner list contains the following format:\n [Date (datetime object), Temperature (int), Humidity (int), Wind Speed (int), Precipitation (float)]\n\n Returns:\n - result (dict): A dictionary containing:\n - 'sum': Sum of the values in the specified column.\n - 'mean': Mean of the values in the specified column.\n - 'min': Minimum value in the specified column.\n - 'max': Maximum value in the specified column.\n - 'plot': A matplotlib BarContainer object of the histogram plot for the specified column.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> data = [[datetime(2022, 1, 1), -5, 80, 10, 0], [datetime(2022, 1, 3), -2, 83, 15, 0]]\n >>> result = f_397('Temperature', data)\n >>> result['sum']\n -7\n >>> type(result['plot'])\n \n \"\"\"", "canonical_solution": " COLUMNS = [\"Date\", \"Temperature\", \"Humidity\", \"Wind Speed\", \"Precipitation\"]\n df = pd.DataFrame(data, columns=COLUMNS)\n column_data = df[column]\n\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.nan if df.empty else np.mean(column_data),\n \"min\": np.inf if df.empty else np.min(column_data),\n \"max\": -np.inf if df.empty else np.max(column_data),\n }\n\n _, _, ax = plt.hist(column_data)\n plt.title(f\"Histogram of {column}\")\n\n result[\"plot\"] = ax\n\n return result", "test": "import unittest\nimport matplotlib\nimport matplotlib.pyplot as plt\nfrom datetime import datetime\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.data = [\n [datetime(2022, 1, 1), -5, 80, 10, 0],\n [datetime(2022, 1, 2), -3, 85, 12, 0.5],\n [datetime(2022, 1, 3), -2, 83, 15, 0],\n [datetime(2022, 1, 4), -1, 82, 13, 0.2],\n [datetime(2022, 1, 5), 0, 80, 11, 0.1],\n ]\n def test_case_1(self):\n # Testing the 'Temperature' column\n result = f_397(\"Temperature\", self.data)\n self.assertEqual(result[\"sum\"], 
-11)\n self.assertEqual(result[\"mean\"], -2.2)\n self.assertEqual(result[\"min\"], -5)\n self.assertEqual(result[\"max\"], 0)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_2(self):\n # Testing the 'Humidity' column\n result = f_397(\"Humidity\", self.data)\n self.assertEqual(result[\"sum\"], 410)\n self.assertEqual(result[\"mean\"], 82)\n self.assertEqual(result[\"min\"], 80)\n self.assertEqual(result[\"max\"], 85)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_3(self):\n # Testing the 'Wind Speed' column\n result = f_397(\"Wind Speed\", self.data)\n self.assertEqual(result[\"sum\"], 61)\n self.assertEqual(result[\"mean\"], 12.2)\n self.assertEqual(result[\"min\"], 10)\n self.assertEqual(result[\"max\"], 15)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_4(self):\n # Testing the 'Precipitation' column\n result = f_397(\"Precipitation\", self.data)\n self.assertAlmostEqual(result[\"sum\"], 0.8, places=6)\n self.assertAlmostEqual(result[\"mean\"], 0.16, places=6)\n self.assertAlmostEqual(result[\"min\"], 0, places=6)\n self.assertAlmostEqual(result[\"max\"], 0.5, places=6)\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def test_case_5(self):\n # Testing with empty data\n result = f_397(\"Temperature\", [])\n self.assertTrue(np.isnan(result[\"mean\"]))\n self.assertEqual(result[\"sum\"], 0)\n self.assertTrue(\n np.isinf(result[\"min\"]) and result[\"min\"] > 0\n ) # Checking for positive infinity for min\n self.assertTrue(\n np.isinf(result[\"max\"]) and result[\"max\"] < 0\n ) # Checking for negative infinity for max\n self.assertIsInstance(result[\"plot\"], matplotlib.container.BarContainer)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.sum", "pandas.DataFrame", "matplotlib.pyplot.hist", "numpy.mean", "numpy.min", "matplotlib.pyplot.title", "numpy.nan", "numpy.inf", "numpy.max"], 
"libs": ["pandas", "numpy", "matplotlib"], "doc": {"description": ["Analyze and visualize statistical properties of a specified weather data column.", "This function calculates the sum, mean, minimum, and maximum values of a specified column in the given data.", "It also generates a histogram plot of the data in the column. The dataset is expected to be a list of weather", "observations, where each observation includes date, temperature, humidity, wind speed, and precipitation values.", "If the provided data list is empty, resulting in an empty DataFrame, the function handles it by setting:", "- The 'mean' value to np.nan.", "- The 'min' value to np.inf.", "- The 'max' value to -np.inf."], "note": [], "params": ["column (str): The column to analyze. Valid columns include 'Temperature', 'Humidity', 'Wind Speed', and 'Precipitation'.", "data (list of lists): The weather data where each inner list contains the following format:", "[Date (datetime object), Temperature (int), Humidity (int), Wind Speed (int), Precipitation (float)]"], "returns": ["result (dict): A dictionary containing:", "'sum': Sum of the values in the specified column.", "'mean': Mean of the values in the specified column.", "'min': Minimum value in the specified column.", "'max': Maximum value in the specified column.", "'plot': A matplotlib BarContainer object of the histogram plot for the specified column."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [[datetime(2022, 1, 1), -5, 80, 10, 0], [datetime(2022, 1, 3), -2, 83, 15, 0]]", ">>> result = f_397('Temperature', data)", ">>> result['sum']", "-7", ">>> type(result['plot'])", ""]}} +{"task_id": "f_781", "prompt": "import re\nimport pandas as pd\n\ndef f_781(input_df):\n \"\"\"\n Cleans the text in a pandas DataFrame column named 'text' by removing all special characters, punctuation marks, and spaces, then calculates the length of the cleaned text.\n\n Requirements:\n - re\n - pandas\n\n Parameters:\n 
- input_df (pandas.DataFrame): DataFrame with a column 'text' containing strings with alphanumeric and/or special characters.\n\n Returns:\n - pandas.DataFrame: A DataFrame with two new columns 'clean_text' and 'text_length', where 'clean_text' is the cleaned text and 'text_length' is its length.\n\n Examples:\n >>> df = pd.DataFrame({'text': ['Special $#! characters spaces 888323']})\n >>> print(f_781(df))\n clean_text text_length\n 0 Specialcharactersspaces888323 29\n >>> df = pd.DataFrame({'text': ['Hello, World!']})\n >>> print(f_781(df))\n clean_text text_length\n 0 HelloWorld 10\n \"\"\"", "canonical_solution": " def clean_text_and_calculate_length(row):\n if pd.isnull(row['text']):\n return pd.Series(['', 0], index=['clean_text', 'text_length'])\n cleaned_text = re.sub('[^A-Za-z0-9]+', '', str(row['text']))\n return pd.Series([cleaned_text, len(cleaned_text)], index=['clean_text', 'text_length'])\n \n return input_df.apply(clean_text_and_calculate_length, axis=1)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.df = pd.DataFrame({'text': ['hello', 'world', 'Special $#! 
characters spaces 888323', 'Hello, World!', '', None]})\n def test_clean_text_and_calculate_length(self):\n result = f_781(self.df)\n expected_clean_text = ['hello', 'world', 'Specialcharactersspaces888323', 'HelloWorld', '', '']\n expected_text_length = [5, 5, 29, 10, 0, 0]\n pd.testing.assert_series_equal(result['clean_text'], pd.Series(expected_clean_text, name='clean_text'), check_names=False)\n pd.testing.assert_series_equal(result['text_length'], pd.Series(expected_text_length, name='text_length'), check_names=False)\n def test_with_special_characters(self):\n df = pd.DataFrame({'text': ['@@@hello***', '%%%world$$$']})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], 'hello')\n self.assertEqual(result['clean_text'].iloc[1], 'world')\n self.assertEqual(result['text_length'].iloc[0], 5)\n self.assertEqual(result['text_length'].iloc[1], 5)\n def test_with_numeric_strings(self):\n df = pd.DataFrame({'text': ['123', '4567']})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], '123')\n self.assertEqual(result['clean_text'].iloc[1], '4567')\n self.assertEqual(result['text_length'].iloc[0], 3)\n self.assertEqual(result['text_length'].iloc[1], 4)\n def test_empty_and_none(self):\n df = pd.DataFrame({'text': ['', None]})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], '')\n self.assertEqual(result['clean_text'].iloc[1], '')\n self.assertEqual(result['text_length'].iloc[0], 0)\n self.assertEqual(result['text_length'].iloc[1], 0)\n def test_mixed_cases(self):\n df = pd.DataFrame({'text': ['HelloWorld', 'HELLOworld123']})\n result = f_781(df)\n self.assertEqual(result['clean_text'].iloc[0], 'HelloWorld')\n self.assertEqual(result['clean_text'].iloc[1], 'HELLOworld123')\n self.assertEqual(result['text_length'].iloc[0], 10)\n self.assertEqual(result['text_length'].iloc[1], 13)", "apis": ["pandas.isnull", "re.sub", "pandas.Series"], "libs": ["re", "pandas"], "doc": {"description": ["Cleans the text in a pandas 
DataFrame column named 'text' by removing all special characters, punctuation marks, and spaces, then calculates the length of the cleaned text."], "note": [], "params": ["input_df (pandas.DataFrame): DataFrame with a column 'text' containing strings with alphanumeric and/or special characters."], "returns": ["pandas.DataFrame: A DataFrame with two new columns 'clean_text' and 'text_length', where 'clean_text' is the cleaned text and 'text_length' is its length."], "reqs": ["re", "pandas"], "raises": [], "example": ["Examples:", ">>> df = pd.DataFrame({'text': ['Special $#! characters spaces 888323']})", ">>> print(f_781(df))", "clean_text text_length", "0 Specialcharactersspaces888323 29", ">>> df = pd.DataFrame({'text': ['Hello, World!']})", ">>> print(f_781(df))", "clean_text text_length", "0 HelloWorld 10"]}} +{"task_id": "f_383", "prompt": "from datetime import datetime, timedelta\nimport pytz\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_383(start_time, end_time):\n \"\"\"\n Plots the hourly difference between UTC and specified global time zones across a date range.\n\n This function visualizes the time difference in hours between UTC and predefined time zones for each day\n within the specified date range. Predefined time zones include UTC, America/Los_Angeles, Europe/Paris,\n Asia/Kolkata, and Australia/Sydney. 
The differences are plotted on a graph, using a distinct color for\n each time zone's time difference curve, selecting from [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"].\n\n Parameters:\n - start_time (str): The start date in the format \"yyyy-mm-dd\".\n - end_time (str): The end date in the format \"yyyy-mm-dd\".\n\n Returns:\n - matplotlib.axes.Axes: The Axes object with the plotted time differences in hours between UTC and \n other time zones.\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - pytz\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_383('2021-01-01', '2021-01-10')\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(18628.0, 0, '2021-01-01'), Text(18629.0, 0, '2021-01-02'), Text(18630.0, 0, '2021-01-03'), Text(18631.0, 0, '2021-01-04'), Text(18632.0, 0, '2021-01-05'), Text(18633.0, 0, '2021-01-06'), Text(18634.0, 0, '2021-01-07'), Text(18635.0, 0, '2021-01-08'), Text(18636.0, 0, '2021-01-09')]\n \"\"\"", "canonical_solution": " # Constants\n TIMEZONES = [\n \"UTC\",\n \"America/Los_Angeles\",\n \"Europe/Paris\",\n \"Asia/Kolkata\",\n \"Australia/Sydney\",\n ]\n COLORS = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]\n\n start_date = datetime.strptime(start_time, \"%Y-%m-%d\")\n end_date = datetime.strptime(end_time, \"%Y-%m-%d\")\n current_tz = pytz.timezone(\"UTC\")\n dates = np.arange(start_date, end_date, timedelta(days=1)).astype(datetime)\n differences = []\n for tz in TIMEZONES:\n other_tz = pytz.timezone(tz)\n difference = [\n (other_tz.localize(dt) - current_tz.localize(dt)).total_seconds() / 3600\n for dt in dates\n ]\n differences.append(difference)\n fig, ax = plt.subplots()\n for i, difference in enumerate(differences):\n ax.plot(dates, difference, color=COLORS[i % len(COLORS)], label=TIMEZONES[i])\n ax.set_xlabel(\"Date\")\n ax.set_ylabel(\"Time difference (hours)\")\n ax.legend()\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def 
test_case_1(self):\n # Test basic functionality\n ax = f_383(\"2021-01-01\", \"2021-01-10\")\n self._common_assertions(ax)\n def test_case_2(self):\n # Test single day range\n ax = f_383(\"2021-01-01\", \"2021-01-01\")\n self._common_assertions(ax)\n def test_case_3(self):\n # Test leap year\n ax = f_383(\"2020-02-28\", \"2020-03-01\")\n self._common_assertions(ax)\n def test_case_4(self):\n # Test DST transition\n ax = f_383(\"2021-03-27\", \"2021-03-29\")\n self._common_assertions(ax)\n def test_case_5(self):\n # Test plotting consistency\n ax = f_383(\"2021-01-01\", \"2021-01-10\")\n colors = [line.get_color() for line in ax.get_lines()]\n self.assertEqual(len(set(colors)), len(colors)) # Check if colors are unique\n def test_case_6(self):\n # Testing input validation via invalid date format\n with self.assertRaises(ValueError):\n f_383(\"01-01-2021\", \"10-01-2021\")\n def _common_assertions(self, ax):\n \"\"\"Common assertions for all test cases\"\"\"\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel().lower(), \"time difference (hours)\".lower())\n legend_labels = [text.get_text() for text in ax.get_legend().get_texts()]\n expected_timezones = [\n \"UTC\",\n \"America/Los_Angeles\",\n \"Europe/Paris\",\n \"Asia/Kolkata\",\n \"Australia/Sydney\",\n ]\n self.assertListEqual(legend_labels, expected_timezones)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "pytz.timezone", "datetime.datetime.strptime", "matplotlib.pyplot.subplots", "datetime.timedelta"], "libs": ["numpy", "matplotlib", "pytz", "datetime"], "doc": {"description": ["Plots the hourly difference between UTC and specified global time zones across a date range.", "This function visualizes the time difference in hours between UTC and predefined time zones for each day", "within the specified date range. Predefined time zones include UTC, America/Los_Angeles, Europe/Paris,", "Asia/Kolkata, and Australia/Sydney. 
The differences are plotted on a graph, using a distinct color for", "each time zone's time difference curve, selecting from [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]."], "note": [], "params": ["start_time (str): The start date in the format \"yyyy-mm-dd\".", "end_time (str): The end date in the format \"yyyy-mm-dd\"."], "returns": ["matplotlib.axes.Axes: The Axes object with the plotted time differences in hours between UTC and", "other time zones."], "reqs": ["datetime.datetime", "datetime.timedelta", "pytz", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_383('2021-01-01', '2021-01-10')", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(18628.0, 0, '2021-01-01'), Text(18629.0, 0, '2021-01-02'), Text(18630.0, 0, '2021-01-03'), Text(18631.0, 0, '2021-01-04'), Text(18632.0, 0, '2021-01-05'), Text(18633.0, 0, '2021-01-06'), Text(18634.0, 0, '2021-01-07'), Text(18635.0, 0, '2021-01-08'), Text(18636.0, 0, '2021-01-09')]"]}} +{"task_id": "f_818", "prompt": "import numpy as np\nimport pandas as pd\n\ndef f_818(rows, columns=[\"A\", \"B\", \"C\", \"D\", \"E\"], seed=0) -> pd.DataFrame:\n \"\"\"\n Create a Pandas DataFrame with a specified number of rows filled with random\n values in [0, 1) and shuffled columns.\n \n Note:\n - The columns should be unique and sorted in the ascending order.\n\n Parameters:\n rows (int): The number of rows for the DataFrame. Must not be negative.\n columns (list of str): Column names for the DataFrame.\n Defaults to ['A', 'B', 'C', 'D', 'E'].\n If it contains repeated columns, the function deduplicates\n it in a case and spacing sensitive way. 
If it is empty,\n the function returns an empty DataFrame.\n seed (int): The random seed for reproducibility.\n \n Returns:\n pd.DataFrame: A pandas DataFrame with shuffled columns.\n\n Requirements:\n - numpy\n - pandas\n\n Example:\n >>> df = f_818(10)\n >>> df.head(2)\n D E A C B\n 0 0.548814 0.715189 0.602763 0.544883 0.423655\n 1 0.645894 0.437587 0.891773 0.963663 0.383442\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n columns = sorted(list(set(columns)))\n data = np.random.rand(rows, len(columns))\n np.random.shuffle(columns)\n df = pd.DataFrame(data, columns=columns)\n return df", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case - data and format correctness\n df = f_818(10, seed=0)\n default_columns = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n self.assertEqual(df.shape, (10, 5))\n for column in default_columns:\n self.assertEqual(df.dtypes[column], np.float64)\n self.assertEqual(len(set(df.columns)), len(default_columns))\n def test_case_2(self):\n # Test custom columns\n custom_columns = [\"X\", \"Y\", \"Z\"]\n df = f_818(5, columns=custom_columns, seed=0)\n self.assertTrue(all(column in custom_columns for column in df.columns))\n # assert first 2 rows data\n self.assertEqual(df.iloc[0].tolist(), [0.5488135039273248, 0.7151893663724195, 0.6027633760716439])\n \n def test_case_3(self):\n # Test custom rows\n for n_rows in [1, 10, 50]:\n df = f_818(n_rows)\n self.assertEqual(len(df), n_rows)\n def test_case_4(self):\n df = f_818(5, seed=42)\n self.assertEqual(df.iloc[0].tolist(), [0.3745401188473625, 0.9507143064099162, 0.7319939418114051, 0.5986584841970366, 0.15601864044243652])\n def test_case_5(self):\n # Test handling edge cases - negative rows\n with self.assertRaises(ValueError):\n f_818(-1)\n def test_case_6(self):\n # Test handling empty columns\n df = f_818(5, columns=[])\n self.assertTrue(df.empty)\n self.assertEqual(df.shape, (5, 0))\n def 
test_case_7(self):\n # Test handling duplicate columns\n df = f_818(5, columns=[\"A\", \"A\", \"B\", \"B\", \"C\"], seed=0)\n self.assertEqual(len(df.columns), 3)", "apis": ["numpy.random.shuffle", "pandas.DataFrame", "numpy.random", "numpy.random.rand", "numpy.random.seed"], "libs": ["pandas", "numpy"], "doc": {"description": ["Create a Pandas DataFrame with a specified number of rows filled with random", "values in [0, 1) and shuffled columns."], "note": ["The columns should be unique and sorted in the ascending order."], "params": ["rows (int): The number of rows for the DataFrame. Must not be negative.", "columns (list of str): Column names for the DataFrame.", "Defaults to ['A', 'B', 'C', 'D', 'E'].", "If it contains repeated columns, the function deduplicates", "it in a case and spacing sensitive way. If it is empty,", "the function returns an empty DataFrame.", "seed (int): The random seed for reproducibility."], "returns": ["pd.DataFrame: A pandas DataFrame with shuffled columns."], "reqs": ["numpy", "pandas"], "raises": [], "example": [">>> df = f_818(10)", ">>> df.head(2)", "D E A C B", "0 0.548814 0.715189 0.602763 0.544883 0.423655", "1 0.645894 0.437587 0.891773 0.963663 0.383442"]}} +{"task_id": "f_327", "prompt": "import random\nimport matplotlib.pyplot as plt\n\n\ndef f_327(points: int):\n \"\"\"\n Generate a plot of random numbers such that indices are on the x-axis and generated numbers are on the y-axis.\n\n Parameters:\n - points (int): Number of random points to generate.\n\n Returns:\n - Returns a tuple containing:\n - A list of generated random numbers.\n - A matplotlib Axes object representing the plot.\n\n Requirements:\n - random\n - matplotlib.pyplot\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> f_327(5)\n ([0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085], )\n >>> f_327(3)\n ([0.4049341374504143, 0.7837985890347726, 0.30331272607892745], )\n \"\"\"", "canonical_solution": " 
x = list(range(points))\n y = [random.random() for _ in range(points)]\n\n _, ax = plt.subplots()\n ax.plot(x, y)\n\n return y, ax", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n random.seed(0)\n y, _ = f_327(5)\n # Test correct number of points are generated\n self.assertEqual(len(y), 5)\n def test_case_2(self):\n random.seed(0)\n y, _ = f_327(5)\n # Test expected values\n self.assertTrue(all(0 <= num <= 1 for num in y))\n self.assertAlmostEqual(\n y,\n [\n 0.8444218515250481,\n 0.7579544029403025,\n 0.420571580830845,\n 0.25891675029296335,\n 0.5112747213686085,\n ],\n )\n def test_case_3(self):\n random.seed(0)\n # Test incorrect data types\n with self.assertRaises(TypeError):\n f_327(\"5\")\n with self.assertRaises(TypeError):\n f_327([])\n with self.assertRaises(TypeError):\n f_327(None)\n def test_case_4(self):\n random.seed(0)\n # Test handling 1 number\n y, ax = f_327(1)\n # Assert that 1 random number is generated\n self.assertEqual(len(y), 1)\n # Assert that the plot has the correct x and y data\n self.assertEqual(list(ax.lines[0].get_xdata()), [0])\n self.assertEqual(list(ax.lines[0].get_ydata()), y)\n def test_case_5(self):\n random.seed(0)\n # Test handling no random numbers\n y, ax = f_327(0)\n self.assertEqual(len(y), 0)\n # Assert that the plot has no data\n self.assertEqual(list(ax.lines[0].get_xdata()), [])\n self.assertEqual(list(ax.lines[0].get_ydata()), [])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "random.random"], "libs": ["random", "matplotlib"], "doc": {"description": ["Generate a plot of random numbers such that indices are on the x-axis and generated numbers are on the y-axis."], "note": [], "params": ["points (int): Number of random points to generate."], "returns": ["Returns a tuple containing:", "A list of generated random numbers.", "A matplotlib Axes object representing the plot."], "reqs": ["random", "matplotlib.pyplot"], "raises": 
[], "example": [">>> import random", ">>> random.seed(0)", ">>> f_327(5)", "([0.8444218515250481, 0.7579544029403025, 0.420571580830845, 0.25891675029296335, 0.5112747213686085], )", ">>> f_327(3)", "([0.4049341374504143, 0.7837985890347726, 0.30331272607892745], )"]}} +{"task_id": "f_532", "prompt": "import os\nimport random\nimport json\n\ndef f_532(directory, n):\n \"\"\"\n Create n random files in a directory with json content with the key 'number' and a random integer value between 1 and 100, and then reset the cursor to the beginning of each file.\n\n Parameters:\n - directory (str): The directory in which to generate the files.\n - n (int): The number of files to generate.\n\n Returns:\n - directory (str): The directory in which the files were generated.\n\n Requirements:\n - os\n - random\n - json\n\n Example:\n >>> f_532('/path/to/directory', 1)\n '/path/to/directory'\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n os.makedirs(directory)\n\n for i in range(n):\n filename = str(i) + \".json\"\n filepath = os.path.join(directory, filename)\n\n with open(filepath, 'w') as file:\n json.dump({'number': random.randint(1, 100)}, file)\n file.seek(0)\n\n return directory", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n shutil.rmtree('./source', ignore_errors=True)\n shutil.rmtree('./src', ignore_errors=True)\n shutil.rmtree('./s', ignore_errors=True)\n def test_case_1(self):\n random.seed(0)\n directory = f_532('./source', 10)\n self.assertTrue(os.path.exists(directory))\n read_data = []\n for file in sorted(os.listdir(directory)):\n with open(os.path.join(directory, file), 'r') as f:\n read_data.append(json.load(f))\n self.assertEqual(read_data, [{'number': 50}, {'number': 98}, {'number': 54}, {'number': 6}, {'number': 34}, {'number': 66}, {'number': 63}, {'number': 52}, {'number': 39}, {'number': 62}])\n shutil.rmtree(directory)\n def test_case_2(self):\n random.seed(1)\n directory = 
f_532('./src', 1)\n self.assertTrue(os.path.exists(directory))\n read_data = []\n for file in os.listdir(directory):\n with open(os.path.join(directory, file), 'r') as f:\n read_data.append(json.load(f))\n self.assertEqual(read_data, [{'number': 18}])\n shutil.rmtree(directory)\n def test_case_3(self):\n directory = f_532('./s', 100)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 100)\n shutil.rmtree(directory)\n def test_case_4(self):\n directory = f_532('./s', 0)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 0)\n shutil.rmtree(directory)\n def test_case_5(self):\n random.seed(2)\n directory = f_532('./source', 1)\n self.assertTrue(os.path.exists(directory))\n read_data = []\n for file in os.listdir(directory):\n with open(os.path.join(directory, file), 'r') as f:\n read_data.append(json.load(f))\n self.assertEqual(read_data, [{'number': 8}])\n shutil.rmtree(directory)", "apis": ["json.dump", "os.makedirs", "os.path", "random.randint", "os.path.join", "os.path.exists"], "libs": ["os", "json", "random"], "doc": {"description": ["Create n random files in a directory with json content with the key 'number' and a random integer value between 1 and 100, and then reset the cursor to the beginning of each file."], "note": [], "params": ["directory (str): The directory in which to generate the files.", "n (int): The number of files to generate."], "returns": ["directory (str): The directory in which the files were generated."], "reqs": ["os", "random", "json"], "raises": [], "example": [">>> f_532('/path/to/directory', 1)", "'/path/to/directory'"]}} +{"task_id": "f_806", "prompt": "import os\nimport glob\nfrom pathlib import Path\nimport zipfile\n\n\ndef f_806(source_directory, target_directory, zip_name):\n \"\"\"\n Zip files with certain extensions from a source directory and save it as a zip file\n saved to a target directory.\n\n Parameters:\n - source_directory (str): The 
source directory containing the files to be zipped.\n - target_directory (str): The destination directory of the zip file to be created.\n If it does not exist, the function will create it.\n - zip_name (str): The name of the zip file to create (without extension; '.zip' will be added automatically).\n\n Returns:\n - str: The full path to the created zip file in the format \"/path/to/target_directory/zip_name.zip\".\n\n Raises:\n - OSError: If the source_directory does not exist.\n\n Requirements:\n - os\n - glob\n - pathlib\n - zipfile\n\n Note:\n - The valid extensions are: ['.txt', '.docx', '.xlsx', '.csv'].\n\n\n Example:\n >>> path = f_806('/path/to/source_directory', '/path/to/target_directory', 'zipped_files')\n >>> type(path)\n \n >>> path\n '/path/to/target_directory/zipped_files.zip'\n \"\"\"", "canonical_solution": " if not os.path.exists(source_directory):\n raise OSError(\"source_directory must exist.\")\n if not os.path.exists(target_directory):\n os.makedirs(target_directory, exist_ok=True)\n\n zip_path = os.path.join(target_directory, f\"{zip_name.strip()}.zip\")\n with zipfile.ZipFile(zip_path, \"w\") as zipf:\n for extension in [\".txt\", \".docx\", \".xlsx\", \".csv\"]:\n for file in glob.glob(\n f\"{source_directory}/**/*{extension}\", recursive=True\n ):\n zipf.write(file, arcname=Path(file).name)\n\n return os.path.abspath(zip_path)", "test": "import unittest\nimport tempfile\nimport os\nfrom pathlib import Path\nimport zipfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_source_dir = tempfile.TemporaryDirectory()\n self.temp_target_dir = tempfile.TemporaryDirectory()\n self.test_source_dir = self.temp_source_dir.name\n self.test_target_dir = self.temp_target_dir.name\n # Setup directory and files structure for testing\n self.files_structure = {\n \"empty_dir\": [],\n \"no_matching_files\": [\"a.pdf\", \"b.gif\"],\n \"some_matching_files\": [\"c.txt\", \"d.docx\", \"e.png\"],\n \"all_matching_files\": [\"f.txt\", 
\"g.docx\", \"h.xlsx\", \"i.csv\"],\n \"nested_dir\": [\"nested/j.txt\", \"nested/k.docx\", \"nested/l.png\"],\n \"deeply_nested_dir\": [\"deep/nested/m.xlsx\", \"deep/nested/n.csv\"],\n \"mixed_extensions\": [\"o.txt\", \"p.docx\", \"q.unknown\", \"r.csv\"],\n \"subdirs_with_files\": [\n \"subdir1/s.txt\",\n \"subdir2/t.xlsx\",\n \"subdir3/u.docx\",\n \"subdir2/v.csv\",\n ],\n }\n for dir_key, files in self.files_structure.items():\n if files:\n for file_path in files:\n full_path = os.path.join(self.test_source_dir, dir_key, file_path)\n os.makedirs(os.path.dirname(full_path), exist_ok=True)\n with open(full_path, \"w\") as f:\n f.write(\"dummy content\")\n else:\n os.makedirs(os.path.join(self.test_source_dir, dir_key), exist_ok=True)\n def tearDown(self):\n self.temp_source_dir.cleanup()\n self.temp_target_dir.cleanup()\n def zip_file_count(self, zip_path):\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n return sum(\n 1 for item in zip_ref.namelist() if Path(item).suffix in extensions\n )\n def test_case_1(self):\n # Test empty directory\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"empty_dir\"),\n self.test_target_dir,\n \"empty_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 0)\n def test_case_2(self):\n # Test no matching files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"no_matching_files\"),\n self.test_target_dir,\n \"no_match_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 0)\n def test_case_3(self):\n # Test some matching files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"some_matching_files\"),\n self.test_target_dir,\n \"some_match_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 2)\n def test_case_4(self):\n # Test all matching files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"all_matching_files\"),\n self.test_target_dir,\n \"all_match_test\",\n )\n 
self.assertEqual(self.zip_file_count(zip_path), 4)\n def test_case_5(self):\n # Test nested directory\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"nested_dir\"),\n self.test_target_dir,\n \"nested_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 2)\n def test_case_6(self):\n # Test mixed extension\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"mixed_extensions\"),\n self.test_target_dir,\n \"mixed_extensions_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 3)\n def test_case_7(self):\n # Test subdirectories with files\n zip_path = f_806(\n os.path.join(self.test_source_dir, \"subdirs_with_files\"),\n self.test_target_dir,\n \"subdirs_with_files_test\",\n )\n self.assertEqual(self.zip_file_count(zip_path), 4)", "apis": ["zipfile.ZipFile", "os.makedirs", "glob.glob", "os.path", "pathlib.Path", "os.path.join", "os.path.exists", "os.path.abspath"], "libs": ["zipfile", "glob", "pathlib", "os"], "doc": {"description": ["Zip files with certain extensions from a source directory and save it as a zip file", "saved to a target directory."], "note": ["The valid extensions are: ['.txt', '.docx', '.xlsx', '.csv']."], "params": ["source_directory (str): The source directory containing the files to be zipped.", "target_directory (str): The destination directory of the zip file to be created.", "If it does not exist, the function will create it.", "zip_name (str): The name of the zip file to create (without extension; '.zip' will be added automatically)."], "returns": ["str: The full path to the created zip file in the format \"/path/to/target_directory/zip_name.zip\"."], "reqs": ["os", "glob", "pathlib", "zipfile"], "raises": ["OSError: If the source_directory does not exist."], "example": [">>> path = f_806('/path/to/source_directory', '/path/to/target_directory', 'zipped_files')", ">>> type(path)", "", ">>> path", "'/path/to/target_directory/zipped_files.zip'"]}} +{"task_id": "f_350", "prompt": "import numpy as np\nfrom 
scipy.spatial import Voronoi, voronoi_plot_2d\nimport matplotlib.pyplot as plt\n\n\ndef f_350(points, seed=0):\n \"\"\"\n Calculate the Voronoi diagram for a number of points in 2D and plot it.\n Note: this function will raise errors when input is invalid, for example wrong type or shape.\n Jittering is applied prior to plotting.\n\n Parameters:\n - points (np.ndarray): A numpy ndarray of shape (n_points, 2) with the coordinates of the points.\n - seed (int): Random seed for reproducibility. Defaults to 0.\n\n Returns:\n tuple (vor, ax): A tuple containing:\n - vor (Voronoi): A Voronoi object representing the Voronoi diagram of the points.\n - ax (Axes): The axes of the plotted Voronoi diagram.\n\n Requirements:\n - numpy\n - scipy\n - matplotlib.pyplot\n\n Example:\n >>> points = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])\n >>> vor, ax = f_350(points)\n >>> type(vor)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if not isinstance(points, np.ndarray):\n raise TypeError(\"Expected Numpy array\")\n if len(points) < 3:\n raise ValueError(\"Voronoi diagram needs at least 3 points\")\n if points.shape[-1] != 2:\n raise ValueError(\"Expected array of 2D points\")\n\n np.random.seed(seed)\n\n # Add a slight random jitter to the points\n jittered_points = points + np.random.normal(0, 1e-10, points.shape)\n\n vor = Voronoi(jittered_points)\n fig, ax = plt.subplots()\n voronoi_plot_2d(vor, ax=ax)\n\n return vor, ax", "test": "import unittest\nimport numpy as np\nfrom scipy.spatial import Voronoi\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.points = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])\n def test_case_1(self):\n # Standard tests\n vor, ax = f_350(self.points)\n self._run_test(self.points, vor, ax)\n def test_case_2(self):\n # Test random seed\n vor, _ = f_350(self.points, seed=0)\n vor1, _ = f_350(self.points, seed=0)\n vor2, _ = f_350(self.points, seed=1)\n self.assertTrue((vor.ridge_points == vor1.ridge_points).all())\n 
self.assertFalse((vor1.ridge_points == vor2.ridge_points).all())\n def test_case_3(self):\n # Test with points that are extremely close to each other\n points = np.array([[0, 0], [0, 1e-12], [1, 0]])\n vor, ax = f_350(points)\n self._run_test(points, vor, ax)\n def test_case_4(self):\n # Test with fewer than three points, which is the minimum to form a Voronoi diagram.\n points = np.array([[0, 0], [1, 1]])\n with self.assertRaises(Exception):\n f_350(points)\n def test_case_5(self):\n # Test with invalid input shapes, such as one-dimensional array.\n points = np.array([1, 2, 3])\n with self.assertRaises(Exception):\n f_350(points)\n def test_case_6(self):\n # Test with invalid input types\n with self.assertRaises(Exception):\n f_350(\"Not valid points\")\n def _run_test(self, points, vor, ax):\n # Check the point_region attribute of Voronoi object\n self.assertIsInstance(vor, Voronoi)\n self.assertEqual(len(vor.point_region), len(points))\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(len(ax.get_children()) > 0, \"The plot should have elements.\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random", "scipy.spatial.Voronoi", "scipy.spatial.voronoi_plot_2d", "numpy.ndarray", "matplotlib.pyplot.subplots", "numpy.random.normal", "numpy.random.seed"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Calculate the Voronoi diagram for a number of points in 2D and plot it."], "note": ["this function will raise errors when input is invalid, for example wrong type or shape.", "Jittering is applied prior to plotting."], "params": ["points (np.ndarray): A numpy ndarray of shape (n_points, 2) with the coordinates of the points.", "seed (int): Random seed for reproducibility. 
Defaults to 0."], "returns": ["tuple (vor, ax): A tuple containing:", "vor (Voronoi): A Voronoi object representing the Voronoi diagram of the points.", "ax (Axes): The axes of the plotted Voronoi diagram."], "reqs": ["numpy", "scipy", "matplotlib.pyplot"], "raises": [], "example": [">>> points = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])", ">>> vor, ax = f_350(points)", ">>> type(vor)", "", ">>> type(ax)", ""]}} +{"task_id": "f_414", "prompt": "import json\nimport pandas as pd\nimport numpy as np\nfrom collections import defaultdict\n\n\ndef f_414(input_file=\"data.json\"):\n \"\"\"\n Read a list of dictionaries from a JSON file, calculate the mean and median for each key\n (ignoring non-numeric or missing values), and convert the results into a Pandas DataFrame.\n\n Parameters:\n - input_file (str, optional): The input JSON file name. Defaults to 'data.json'.\n The file should contain a list of dictionaries. If a key is\n missing in a dictionary, it is treated as NaN for that record.\n Non-numeric values are ignored for the calculation of mean\n and median. 
If all values for a key are non-numeric or missing,\n the statistics for that key will be NaN.\n\n Returns:\n - df (pd.DataFrame): A DataFrame indexed and sorted by the variable names (keys) from the\n input data, containing columns 'mean' and 'median'.\n\n Requirements:\n - numpy\n - collections\n - json\n - pandas\n\n Example:\n >>> df = f_414('data_1.json')\n a mean median\n b mean median\n c mean median\n \"\"\"", "canonical_solution": " with open(input_file, \"r\") as f:\n data = json.load(f)\n\n all_keys = set().union(*(d.keys() for d in data))\n stats = defaultdict(list)\n for d in data:\n for key in all_keys:\n value = d.get(key, np.nan)\n if isinstance(value, (int, float)):\n stats[key].append(value)\n else:\n stats[key].append(np.nan)\n\n result = {\n k: {\"mean\": np.nanmean(v), \"median\": np.nanmedian(v)} for k, v in stats.items()\n }\n df = pd.DataFrame(result).transpose().sort_index()\n\n return df", "test": "import unittest\nimport numpy as np\nimport tempfile\nimport json\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.temp_dir = tempfile.TemporaryDirectory()\n cls.test_data_paths = []\n test_data = [\n [{\"a\": 2, \"b\": 3, \"c\": 4}], # Test data for test_case_1\n [{\"a\": 1}], # Test data for test_case_2\n [{\"a\": 1.5}, {\"b\": None}], # Test data for test_case_3\n [], # Test data for test_case_4\n [{\"a\": 1.5, \"c\": 4}, {\"b\": None}], # Test data for test_case_5\n ]\n for idx, data in enumerate(test_data, start=1):\n path = cls.temp_dir.name + f\"/test_data_{idx}.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n cls.test_data_paths.append(path)\n def test_case_1(self):\n # Basic test\n df = f_414(self.test_data_paths[0])\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\", \"c\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 2.0)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 2.0)\n def test_case_2(self):\n # Test with a single key\n df = f_414(self.test_data_paths[1])\n 
self.assertListEqual(df.index.tolist(), [\"a\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 1.0)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 1.0)\n def test_case_3(self):\n # Test with missing values to ensure handling of NaN\n df = f_414(self.test_data_paths[2])\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 1.5)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 1.5)\n self.assertTrue(np.isnan(df.loc[\"b\", \"mean\"]))\n self.assertTrue(np.isnan(df.loc[\"b\", \"median\"]))\n def test_case_4(self):\n # Test empty dataframe creation from an empty input file\n df = f_414(self.test_data_paths[3])\n self.assertEqual(df.shape[0], 0)\n def test_case_5(self):\n # Test handling of mixed data, including valid values and NaN\n df = f_414(self.test_data_paths[4])\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\", \"c\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 1.5)\n self.assertAlmostEqual(df.loc[\"a\", \"median\"], 1.5)\n self.assertTrue(np.isnan(df.loc[\"b\", \"mean\"]))\n self.assertTrue(np.isnan(df.loc[\"b\", \"median\"]))\n self.assertAlmostEqual(df.loc[\"c\", \"mean\"], 4.0)\n self.assertAlmostEqual(df.loc[\"c\", \"median\"], 4.0)\n def test_case_6(self):\n # Test with mixed types in values\n data = [{\"a\": 5, \"b\": \"text\", \"c\": 7}, {\"a\": \"more text\", \"b\": 4, \"c\": None}]\n path = self.temp_dir.name + \"/test_data_6.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n df = f_414(path)\n self.assertListEqual(df.index.tolist(), [\"a\", \"b\", \"c\"])\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 5.0)\n self.assertAlmostEqual(df.loc[\"c\", \"mean\"], 7.0)\n self.assertAlmostEqual(df.loc[\"b\", \"mean\"], 4.0)\n def test_case_7(self):\n # Test a larger dataset with missing values\n data = [{\"a\": i, \"b\": i * 2 if i % 2 == 0 else None} for i in range(1, 101)]\n path = self.temp_dir.name + \"/test_data_7.json\"\n with open(path, \"w\") as f:\n 
json.dump(data, f)\n df = f_414(path)\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 50.5)\n self.assertAlmostEqual(\n df.loc[\"b\", \"mean\"], np.mean([2 * i for i in range(2, 101, 2)])\n )\n def test_case_8(self):\n # Test with all non-numeric values for a key\n data = [\n {\"a\": \"text\", \"b\": \"more text\"},\n {\"a\": \"even more text\", \"b\": \"still more text\"},\n ]\n path = self.temp_dir.name + \"/test_data_8.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n df = f_414(path)\n self.assertTrue(np.isnan(df.loc[\"a\", \"mean\"]))\n self.assertTrue(np.isnan(df.loc[\"b\", \"mean\"]))\n def test_case_9(self):\n # Test varying numbers of missing and non-numeric values\n data = [\n {\"a\": 10, \"b\": 20, \"c\": \"ignore\"},\n {\"a\": None, \"b\": 25, \"c\": 30},\n {\"a\": 5, \"b\": \"ignore\", \"c\": \"ignore\"},\n ]\n path = self.temp_dir.name + \"/test_data_9.json\"\n with open(path, \"w\") as f:\n json.dump(data, f)\n df = f_414(path)\n self.assertAlmostEqual(df.loc[\"a\", \"mean\"], 7.5)\n self.assertAlmostEqual(df.loc[\"b\", \"mean\"], 22.5)\n self.assertAlmostEqual(df.loc[\"c\", \"mean\"], 30.0)\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir.cleanup()", "apis": ["collections.defaultdict", "pandas.DataFrame", "numpy.nanmean", "numpy.nanmedian", "numpy.nan", "json.load"], "libs": ["collections", "numpy", "pandas", "json"], "doc": {"description": ["Read a list of dictionaries from a JSON file, calculate the mean and median for each key", "(ignoring non-numeric or missing values), and convert the results into a Pandas DataFrame."], "note": [], "params": ["input_file (str, optional): The input JSON file name. Defaults to 'data.json'.", "The file should contain a list of dictionaries. If a key is", "missing in a dictionary, it is treated as NaN for that record.", "Non-numeric values are ignored for the calculation of mean", "and median. 
If all values for a key are non-numeric or missing,", "the statistics for that key will be NaN."], "returns": ["df (pd.DataFrame): A DataFrame indexed and sorted by the variable names (keys) from the", "input data, containing columns 'mean' and 'median'."], "reqs": ["numpy", "collections", "json", "pandas"], "raises": [], "example": [">>> df = f_414('data_1.json')", "a mean median", "b mean median", "c mean median"]}} +{"task_id": "f_890", "prompt": "from datetime import datetime\nimport pandas as pd\nfrom itertools import product\n\n# Constants\nEMPLOYEES = [\"John\", \"Alice\", \"Bob\", \"Charlie\", \"Dave\"]\n\n\ndef f_890(date_str):\n \"\"\"\n Generate a Pandas DataFrame containing a series of dates for a predefined list of employees.\n\n Parameters:\n - date_str (str): A date string in the \"yyyy-mm-dd\" format to define the starting date.\n\n Returns:\n - DataFrame: A pandas DataFrame with 'Employee' and 'Date' columns, listing the next 10 days for each employee.\n\n Requirements:\n - datetime.datetime\n - pandas\n - itertools\n\n Example:\n >>> df = f_890('2023-06-15')\n >>> print(df)\n Employee Date\n 0 John 2023-06-15\n 1 John 2023-06-16\n ...\n 49 Dave 2023-06-24\n \"\"\"", "canonical_solution": " start_date = datetime.strptime(date_str, \"%Y-%m-%d\")\n dates = pd.date_range(start_date, periods=10).tolist()\n\n # Creating a DataFrame from the product of EMPLOYEES and dates\n df = pd.DataFrame(list(product(EMPLOYEES, dates)), columns=[\"Employee\", \"Date\"])\n\n return df", "test": "import unittest\nimport pandas as pd\nfrom datetime import datetime, timedelta\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n def test_return_type(self):\n \"\"\"Test if the function returns a Pandas DataFrame.\"\"\"\n df_test = f_890(\"2023-01-01\")\n self.assertIsInstance(df_test, pd.DataFrame)\n def test_correct_columns(self):\n \"\"\"Test if the DataFrame has the correct columns: 'Employee' and 'Date'.\"\"\"\n df_test = 
f_890(\"2023-01-01\")\n self.assertListEqual(df_test.columns.tolist(), [\"Employee\", \"Date\"])\n def test_date_range(self):\n \"\"\"Test if the function generates the correct date range for 10 days.\"\"\"\n start_date = \"2023-01-01\"\n df_test = f_890(start_date)\n end_date = (\n datetime.strptime(start_date, \"%Y-%m-%d\") + timedelta(days=9)\n ).date()\n self.assertTrue(all(df_test[\"Date\"] <= pd.Timestamp(end_date)))\n def test_number_of_rows(self):\n \"\"\"Test if the DataFrame has the correct number of rows (10 days * number of employees).\"\"\"\n df_test = f_890(\"2023-01-01\")\n expected_rows = 10 * len(EMPLOYEES) # 10 days for each employee\n self.assertEqual(len(df_test), expected_rows)\n def test_leap_year(self):\n \"\"\"Test if the function correctly handles the date range for a leap year.\"\"\"\n df_test = f_890(\"2024-02-28\")\n leap_year_end_date = (\n datetime.strptime(\"2024-02-28\", \"%Y-%m-%d\") + timedelta(days=9)\n ).date()\n self.assertIn(pd.Timestamp(leap_year_end_date), df_test[\"Date\"].values)", "apis": ["pandas.DataFrame", "itertools.product", "pandas.date_range", "datetime.datetime.strptime"], "libs": ["itertools", "pandas", "datetime"], "doc": {"description": ["Generate a Pandas DataFrame containing a series of dates for a predefined list of employees."], "note": [], "params": ["date_str (str): A date string in the \"yyyy-mm-dd\" format to define the starting date."], "returns": ["DataFrame: A pandas DataFrame with 'Employee' and 'Date' columns, listing the next 10 days for each employee."], "reqs": ["datetime.datetime", "pandas", "itertools"], "raises": [], "example": [">>> df = f_890('2023-06-15')", ">>> print(df)", "Employee Date", "0 John 2023-06-15", "1 John 2023-06-16", "...", "49 Dave 2023-06-24"]}} +{"task_id": "f_402", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_402(array):\n \"\"\"\n Create a Pandas DataFrame from a 2D list and plot the sum of each column.\n\n Parameters:\n array (list of list 
of int): The 2D list representing the data.\n\n Returns:\n DataFrame, Axes: A pandas DataFrame with the data and a matplotlib Axes object showing the sum of each column.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Internal Constants:\n COLUMNS: List of column names used for the DataFrame ['A', 'B', 'C', 'D', 'E']\n\n Example:\n >>> df, ax = f_402([[1,2,3,4,5], [6,7,8,9,10]])\n >>> print(df)\n A B C D E\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " # Internal Constants\n COLUMNS = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n\n df = pd.DataFrame(array, columns=COLUMNS)\n sums = df.sum()\n\n fig, ax = plt.subplots()\n sums.plot(kind=\"bar\", ax=ax)\n\n return df, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df, ax = f_402([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.assertEqual(df.values.tolist(), [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.assertEqual(df.columns.tolist(), [\"A\", \"B\", \"C\", \"D\", \"E\"])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_2(self):\n df, ax = f_402(\n [[10, 20, 30, 40, 50], [15, 25, 35, 45, 55], [5, 15, 25, 35, 45]]\n )\n self.assertEqual(\n df.values.tolist(),\n [[10, 20, 30, 40, 50], [15, 25, 35, 45, 55], [5, 15, 25, 35, 45]],\n )\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_3(self):\n # Test handling uniform data\n df, ax = f_402([[1, 1, 1, 1, 1]])\n self.assertEqual(df.values.tolist(), [[1, 1, 1, 1, 1]])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_4(self):\n # Test handling all zero\n df, ax = f_402([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])\n self.assertEqual(df.values.tolist(), [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_5(self):\n # Handle negatives\n df, ax = f_402([[-1, -2, -3, -4, -5], [1, 2, 3, 4, 5]])\n self.assertEqual(df.values.tolist(), [[-1, -2, -3, -4, -5], [1, 2, 3, 4, 5]])\n self.assertTrue(isinstance(ax, 
plt.Axes))\n def test_case_6(self):\n # Handle empty\n df, ax = f_402([])\n self.assertEqual(df.values.tolist(), [])\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_7(self):\n # Handle invalid input\n with self.assertRaises(TypeError):\n f_402([[\"a\", \"b\", \"c\", \"d\", \"e\"]])\n def test_case_8(self):\n # Handle large numbers\n df, _ = f_402([[1000000, 2000000, 3000000, 4000000, 5000000]])\n self.assertTrue(\n all(\n df.sum()\n == pd.Series(\n [1000000, 2000000, 3000000, 4000000, 5000000],\n index=[\"A\", \"B\", \"C\", \"D\", \"E\"],\n )\n )\n )\n def test_case_9(self):\n # Test plot details\n _, ax = f_402([[1, 2, 3, 4, 5]])\n self.assertEqual(len(ax.patches), 5) # Checks if there are exactly 5 bars\n bar_labels = [bar.get_x() for bar in ax.patches]\n self.assertEqual(len(bar_labels), 5)\n def test_case_10(self):\n # Test column sums with plot check\n data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [2, 3, 4, 5, 6]]\n df, ax = f_402(data)\n column_sums = df.sum().tolist()\n bar_heights = [bar.get_height() for bar in ax.patches]\n self.assertEqual(column_sums, bar_heights)\n self.assertEqual(\n len(ax.patches), len(data[0])\n ) # Ensure there's a bar for each column\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Create a Pandas DataFrame from a 2D list and plot the sum of each column.", "Internal Constants:", "COLUMNS: List of column names used for the DataFrame ['A', 'B', 'C', 'D', 'E']"], "note": [], "params": ["array (list of list of int): The 2D list representing the data."], "returns": ["DataFrame, Axes: A pandas DataFrame with the data and a matplotlib Axes object showing the sum of each column."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> df, ax = f_402([[1,2,3,4,5], [6,7,8,9,10]])", ">>> print(df)", "A B C D E", "0 1 2 3 4 5", "1 6 7 8 9 10", ">>> type(ax)", ""]}} +{"task_id": "f_914", "prompt": "import 
pandas as pd\nfrom random import shuffle\n\n# Constants\nPOSSIBLE_VALUES = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\", \"G\", \"H\", \"I\", \"J\"]\n\n\ndef f_914(list_of_lists):\n \"\"\"\n Generate a list of pandas DataFrames, each created from a sublist in 'list_of_lists'.\n Each DataFrame has columns named as per the elements of the sublist, and each column\n is filled with randomly shuffled values from 'POSSIBLE_VALUES'.\n\n Parameters:\n - list_of_lists (list of list): A list where each element is a list of strings\n representing column names for a DataFrame.\n\n Returns:\n - list of pandas.DataFrame: A list where each element is a DataFrame with columns as specified\n in 'list_of_lists', and each column contains shuffled values from 'POSSIBLE_VALUES'.\n\n Requirements:\n - pandas\n - random.shuffle\n\n Note:\n - The length of each DataFrame's columns is equal to the length of 'POSSIBLE_VALUES'.\n - Each column in the DataFrame has the same shuffled order of 'POSSIBLE_VALUES'.\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> dfs = f_914([['x', 'y', 'z'], ['a', 'b', 'c']])\n >>> dfs[0].head()\n x y z\n 0 H J H\n 1 I E A\n 2 B I J\n 3 F G D\n 4 D A C\n \"\"\"", "canonical_solution": " dataframes = []\n\n for list_ in list_of_lists:\n df_dict = {col: POSSIBLE_VALUES.copy() for col in list_}\n for col in df_dict:\n shuffle(df_dict[col])\n df = pd.DataFrame(df_dict)\n dataframes.append(df)\n\n return dataframes", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_914 function.\"\"\"\n def test_dataframe_count(self):\n \"\"\"Test number of dataframes returned.\"\"\"\n random.seed(0)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\", \"c\"], [\"m\"]]\n dfs = f_914(input_data)\n self.assertEqual(len(dfs), len(input_data))\n def test_dataframe_columns(self):\n \"\"\"Test each dataframe has correct columns.\"\"\"\n random.seed(1)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\", \"c\"], [\"m\"]]\n 
dfs = f_914(input_data)\n for idx, df in enumerate(dfs):\n self.assertListEqual(list(df.columns), input_data[idx])\n def test_dataframe_values(self):\n \"\"\"Test values in each dataframe column are from the POSSIBLE_VALUES list.\"\"\"\n random.seed(2)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\", \"c\"], [\"m\"]]\n dfs = f_914(input_data)\n for df in dfs:\n for col in df.columns:\n self.assertTrue(all(val in POSSIBLE_VALUES for val in df[col].values))\n def test_empty_input(self):\n \"\"\"Test function with an empty list of lists.\"\"\"\n random.seed(3)\n dfs = f_914([])\n self.assertEqual(len(dfs), 0)\n def test_single_list_input(self):\n \"\"\"Test function with a single list input.\"\"\"\n random.seed(4)\n input_data = [[\"x\", \"y\", \"z\"]]\n dfs = f_914(input_data)\n self.assertEqual(len(dfs), 1)\n self.assertListEqual(list(dfs[0].columns), input_data[0])\n self.assertTrue(all(val in POSSIBLE_VALUES for val in dfs[0][\"x\"].values))\n self.assertTrue(all(val in POSSIBLE_VALUES for val in dfs[0][\"y\"].values))\n self.assertTrue(all(val in POSSIBLE_VALUES for val in dfs[0][\"z\"].values))", "apis": ["pandas.DataFrame", "random.shuffle"], "libs": ["random", "pandas"], "doc": {"description": ["Generate a list of pandas DataFrames, each created from a sublist in 'list_of_lists'.", "Each DataFrame has columns named as per the elements of the sublist, and each column", "is filled with randomly shuffled values from 'POSSIBLE_VALUES'."], "note": ["The length of each DataFrame's columns is equal to the length of 'POSSIBLE_VALUES'.", "Each column in the DataFrame has the same shuffled order of 'POSSIBLE_VALUES'."], "params": ["list_of_lists (list of list): A list where each element is a list of strings", "representing column names for a DataFrame."], "returns": ["list of pandas.DataFrame: A list where each element is a DataFrame with columns as specified", "in 'list_of_lists', and each column contains shuffled values from 'POSSIBLE_VALUES'."], "reqs": ["pandas", 
"random.shuffle"], "raises": [], "example": [">>> import random", ">>> random.seed(0)", ">>> dfs = f_914([['x', 'y', 'z'], ['a', 'b', 'c']])", ">>> dfs[0].head()", "x y z", "0 H J H", "1 I E A", "2 B I J", "3 F G D", "4 D A C"]}} +{"task_id": "f_773", "prompt": "from collections import defaultdict\nimport re\n\ndef f_773(word: str) -> dict:\n \"\"\"\n Find the occurrences of each two-letter combination in the sanitized word,\n where only alphabetic characters are considered.\n\n Requirements:\n - collections.defaultdict\n - re\n \n Parameters:\n word (str): The input string.\n\n Returns:\n collections.defaultdict: A dictionary with keys as two-letter combinations and values as their counts in the sanitized word.\n\n Example:\n >>> f_773('abcdef')\n defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1, 'de': 1, 'ef': 1})\n >>> f_773('aabbcc')\n defaultdict(, {'aa': 1, 'ab': 1, 'bb': 1, 'bc': 1, 'cc': 1})\n >>> f_773('a1!b@c#d$')\n defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1})\n \"\"\"", "canonical_solution": " # Sanitize the word to include only alphabetic characters\n sanitized_word = re.sub('[^A-Za-z]', '', word)\n occurrences = defaultdict(int)\n pairs = [''.join(x) for x in zip(sanitized_word, sanitized_word[1:])]\n\n for pair in pairs:\n occurrences[pair] += 1\n\n return occurrences", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_773('abcdef')\n expected = {'ab': 1, 'bc': 1, 'cd': 1, 'de': 1, 'ef': 1}\n self.assertEqual(result, expected)\n def test_case_2(self):\n result = f_773('aabbcc')\n expected = {'aa': 1, 'ab': 1, 'bb': 1, 'bc': 1, 'cc': 1}\n self.assertEqual(result, expected)\n def test_case_3(self):\n result = f_773('a')\n expected = {}\n self.assertEqual(result, expected)\n def test_case_4(self):\n result = f_773('')\n expected = {}\n self.assertEqual(result, expected)\n def test_case_5(self):\n result = f_773('AbCd')\n expected = {'Ab': 1, 'bC': 1, 'Cd': 1}\n self.assertEqual(result, expected)\n def 
test_case_6(self):\n # Test with non-alphabetic characters in the word\n result = f_773('a1!b@c#d$')\n expected = {'ab': 1, 'bc': 1, 'cd': 1}\n self.assertEqual(result, expected)\n def test_case_7(self):\n # Test with mixed case and non-alphabetic characters\n result = f_773('AaBb!!Cc123')\n expected = {'Aa': 1, 'aB': 1, 'Bb': 1, 'bC': 1, 'Cc': 1}\n self.assertEqual(result, expected)", "apis": ["re.sub", "collections.defaultdict"], "libs": ["re", "collections"], "doc": {"description": ["Find the occurrences of each two-letter combination in the sanitized word,", "where only alphabetic characters are considered."], "note": [], "params": ["word (str): The input string."], "returns": ["collections.defaultdict: A dictionary with keys as two-letter combinations and values as their counts in the sanitized word."], "reqs": ["collections.defaultdict", "re"], "raises": [], "example": [">>> f_773('abcdef')", "defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1, 'de': 1, 'ef': 1})", ">>> f_773('aabbcc')", "defaultdict(, {'aa': 1, 'ab': 1, 'bb': 1, 'bc': 1, 'cc': 1})", ">>> f_773('a1!b@c#d$')", "defaultdict(, {'ab': 1, 'bc': 1, 'cd': 1})"]}} {"task_id": "f_587", "prompt": "import pandas as pd\nfrom sklearn.decomposition import PCA\n\ndef f_587(df):\n \"\"\"\n Perform Principal Component Analysis (PCA) on the DataFrame and record the first two main components.\n \n Parameters:\n - df (DataFrame): The pandas DataFrame.\n \n Returns:\n - df_pca (DataFrame): The DataFrame with the first two principal components named 'PC1' and 'PC2' as columns.\n\n Requirements:\n - pandas\n - sklearn\n \n Example:\n >>> df = pd.DataFrame([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], columns = ['x', 'y', 'z'])\n >>> df_pca = f_587(df)\n >>> print(df_pca)\n PC1 PC2\n 0 0.334781 -0.011992\n 1 -0.187649 -0.142630\n 2 -0.147132 0.154622\n \"\"\"", "canonical_solution": " pca = PCA(n_components=2)\n df_pca = pca.fit_transform(df)\n \n df_pca = pd.DataFrame(df_pca, columns=['PC1', 'PC2'])\n \n return 
df_pca", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame([[0, 0], [0, 0]], columns = ['x', 'y'])\n df_pca = f_587(df)\n self.assertTrue('PC1' in df_pca.columns)\n self.assertTrue('PC2' in df_pca.columns)\n self.assertEqual(df_pca.shape, (2, 2))\n self.assertEqual(df_pca['PC1'].iloc[0], 0)\n self.assertEqual(df_pca['PC2'].iloc[0], 0)\n self.assertEqual(df_pca['PC1'].iloc[1], 0)\n self.assertEqual(df_pca['PC2'].iloc[1], 0)\n def test_case_2(self):\n df = pd.DataFrame([[1, 1], [1, 1]], columns = ['x', 'y'])\n df_pca = f_587(df)\n self.assertTrue('PC1' in df_pca.columns)\n self.assertTrue('PC2' in df_pca.columns)\n self.assertEqual(df_pca.shape, (2, 2))\n self.assertEqual(df_pca['PC1'].iloc[0], 0)\n self.assertEqual(df_pca['PC2'].iloc[0], 0)\n self.assertEqual(df_pca['PC1'].iloc[1], 0)\n self.assertEqual(df_pca['PC2'].iloc[1], 0)\n def test_case_3(self):\n df = pd.DataFrame([[1, 0], [0, 1]], columns = ['x', 'y'])\n df_pca = f_587(df)\n self.assertTrue('PC1' in df_pca.columns)\n self.assertTrue('PC2' in df_pca.columns)\n self.assertEqual(df_pca.shape, (2, 2))\n pca_new = PCA(n_components=2)\n df_pca_new = pca_new.fit_transform(df)\n self.assertEqual(df_pca['PC1'].iloc[0], df_pca_new[0, 0])\n self.assertEqual(df_pca['PC2'].iloc[0], df_pca_new[0, 1])\n self.assertEqual(df_pca['PC1'].iloc[1], df_pca_new[1, 0])\n self.assertEqual(df_pca['PC2'].iloc[1], df_pca_new[1, 1])\n def test_case_4(self):\n df = pd.DataFrame([[4, 3, 2, 1], [1, 2, 3, 4]], columns = ['x', 'y', 'z', 'w'])\n df_pca = f_587(df)\n self.assertTrue('PC1' in df_pca.columns)\n self.assertTrue('PC2' in df_pca.columns)\n self.assertEqual(df_pca.shape, (2, 2))\n pca_new = PCA(n_components=2)\n df_pca_new = pca_new.fit_transform(df)\n self.assertEqual(df_pca['PC1'].iloc[0], df_pca_new[0, 0])\n def test_case_5(self):\n df = pd.DataFrame([[1, 2, 3, 4], [4, 3, 2, 1]], columns = ['x', 'y', 'z', 'w'])\n df_pca = f_587(df)\n self.assertTrue('PC1' in 
df_pca.columns)\n self.assertTrue('PC2' in df_pca.columns)\n self.assertEqual(df_pca.shape, (2, 2))\n pca_new = PCA(n_components=2)\n df_pca_new = pca_new.fit_transform(df)\n self.assertEqual(df_pca['PC1'].iloc[0], df_pca_new[0, 0])", "apis": ["pandas.DataFrame", "sklearn.decomposition.PCA"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform Principal Component Analysis (PCA) on the DataFrame and record the first two main components."], "note": [], "params": ["df (DataFrame): The pandas DataFrame."], "returns": ["df_pca (DataFrame): The DataFrame with the first two principal components named 'PC1' and 'PC2' as columns."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> df = pd.DataFrame([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], columns = ['x', 'y', 'z'])", ">>> df_pca = f_587(df)", ">>> print(df_pca)", "PC1 PC2", "0 0.334781 -0.011992", "1 -0.187649 -0.142630", "2 -0.147132 0.154622"]}} {"task_id": "f_909", "prompt": "from scipy import fftpack\nfrom matplotlib import pyplot as plt\n\n\ndef f_909(arr):\n \"\"\"\n Performs a Fast Fourier Transform (FFT) on the sum of each row in a 2D array and\n plots the absolute values of the FFT coefficients.\n\n Parameters:\n arr (numpy.ndarray): A 2D numpy array.\n\n Returns:\n matplotlib.axes.Axes: An Axes object displaying the plot of the absolute values of the FFT coefficients.\n\n Requirements:\n - scipy.fftpack\n - matplotlib.pyplot\n\n Example:\n >>> import numpy as np\n >>> arr = np.array([[i + j for i in range(3)] for j in range(5)])\n >>> ax = f_909(arr)\n >>> ax.get_title()\n 'Absolute values of FFT coefficients'\n \"\"\"", "canonical_solution": " row_sums = arr.sum(axis=1)\n fft_coefficients = fftpack.fft(row_sums)\n\n _, ax = plt.subplots()\n ax.plot(np.abs(fft_coefficients))\n ax.set_title(\"Absolute values of FFT coefficients\")\n\n return ax", "test": "import unittest\nimport numpy as np\nfrom scipy import fftpack\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for 
the function f_909.\"\"\"\n def test_plot_title(self):\n \"\"\"Test that the plot title is correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n ax = f_909(arr)\n self.assertEqual(ax.get_title(), \"Absolute values of FFT coefficients\")\n def test_plot_data(self):\n \"\"\"Test that the plot data is correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n ax = f_909(arr)\n y_data = ax.lines[0].get_ydata()\n row_sums = arr.sum(axis=1)\n fft_coefficients = fftpack.fft(row_sums)\n expected_y_data = np.abs(fft_coefficients)\n np.testing.assert_array_equal(y_data, expected_y_data)\n def test_with_zeros(self):\n \"\"\"Test that the plot data is correct when the array is all zeros.\"\"\"\n arr = np.zeros((5, 3))\n ax = f_909(arr)\n y_data = ax.lines[0].get_ydata()\n expected_y_data = np.zeros(5)\n np.testing.assert_array_equal(y_data, expected_y_data)\n def test_with_ones(self):\n \"\"\"Test that the plot data is correct when the array is all ones.\"\"\"\n arr = np.ones((5, 3))\n ax = f_909(arr)\n y_data = ax.lines[0].get_ydata()\n expected_y_data = [15.0, 0.0, 0.0, 0.0, 0.0]\n np.testing.assert_array_almost_equal(y_data, expected_y_data)\n def test_with_large_numbers(self):\n \"\"\"Test that the plot data is correct when the array has large numbers.\"\"\"\n arr = np.array([[i * 100 + j * 1000 for i in range(3)] for j in range(5)])\n ax = f_909(arr)\n y_data = ax.lines[0].get_ydata()\n row_sums = arr.sum(axis=1)\n fft_coefficients = fftpack.fft(row_sums)\n expected_y_data = np.abs(fft_coefficients)\n np.testing.assert_array_equal(y_data, expected_y_data)\n def tearDown(self):\n plt.close()", "apis": ["matplotlib.pyplot.subplots", "scipy.fftpack.fft"], "libs": ["matplotlib", "scipy"], "doc": {"description": ["Performs a Fast Fourier Transform (FFT) on the sum of each row in a 2D array and", "plots the absolute values of the FFT coefficients."], "note": [], "params": ["arr (numpy.ndarray): A 2D numpy array."], "returns": 
["matplotlib.axes.Axes: An Axes object displaying the plot of the absolute values of the FFT coefficients."], "reqs": ["scipy.fftpack", "matplotlib.pyplot"], "raises": [], "example": [">>> import numpy as np", ">>> arr = np.array([[i + j for i in range(3)] for j in range(5)])", ">>> ax = f_909(arr)", ">>> ax.get_title()", "'Absolute values of FFT coefficients'"]}} -{"task_id": "f_805", "prompt": "import os\nfrom pathlib import Path\nimport glob\nimport shutil\n\n\ndef f_805(source_directory: str, target_directory: str):\n \"\"\"\n Moves files with specific extensions from a source directory to a target directory,\n handling naming conflicts by renaming duplicates.\n\n Parameters:\n - source_directory (str): The absolute or relative path of the source directory.\n - target_directory (str): The absolute or relative path of the target directory.\n This function will create it if it does not exist.\n\n Returns:\n - int: The number of files successfully moved.\n\n Raises:\n - FileNotFoundError: If source_directory does not exist.\n\n Requirements:\n - os\n - pathlib\n - glob\n - shutil\n\n Notes:\n - This function scans the source directory recursively to find files.\n - Files are filtered by the extensions: \".txt\", \".docx\", \".xlsx\", \".csv\".\n - Renaming of files due to naming conflicts follows the pattern '-n.'.\n\n Examples:\n >>> f_805('./source_folder', './target_folder')\n 3\n >>> f_805('./empty_folder', './target_folder')\n 0\n \"\"\"", "canonical_solution": " moved_files = 0\n\n if not os.path.exists(source_directory):\n raise FileNotFoundError(\"source_directory must exist.\")\n\n if not os.path.exists(target_directory):\n os.makedirs(target_directory)\n\n for extension in [\".txt\", \".docx\", \".xlsx\", \".csv\"]:\n filepaths = glob.glob(\n os.path.join(source_directory, \"**\", \"*\" + extension), recursive=True\n )\n for filepath in filepaths:\n filename = Path(filepath).name\n stem = Path(filepath).stem\n target_filepath = 
os.path.join(target_directory, filename)\n\n count = 1\n while os.path.exists(target_filepath):\n new_filename = f\"{stem}-{count}{extension}\"\n target_filepath = os.path.join(target_directory, new_filename)\n count += 1\n\n shutil.move(filepath, target_filepath)\n moved_files += 1\n\n return moved_files", "test": "import unittest\nimport tempfile\nfrom pathlib import Path\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.valid_extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n def test_case_1(self):\n # Test with an empty source directory\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result, 0, \"Should return 0 for an empty source directory.\"\n )\n def test_case_2(self):\n # Test with a source directory containing only files with no extensions\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n for i in range(3):\n Path(f\"{source_dir}/file_{i}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result, 0, \"Should return 0 for files with non-matching extensions.\"\n )\n def test_case_3(self):\n # Test with a source directory containing files with a mix of extensions\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n extensions = self.valid_extensions + [\".pdf\", \".jpg\"]\n for i, ext in enumerate(extensions):\n Path(f\"{source_dir}/file_{i}{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertTrue(result == len(self.valid_extensions))\n def test_case_4(self):\n # Test with a source directory containing files with all matching extensions\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n for i, ext in enumerate(self.valid_extensions):\n Path(f\"{source_dir}/file_{i}{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n 
result, 4, \"Should return 4 for all files with matching extensions.\"\n )\n def test_case_5(self):\n # Test with a source directory containing nested directories with files\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n Path(f\"{source_dir}/subdir1\").mkdir()\n Path(f\"{source_dir}/subdir1/subdir2\").mkdir()\n for i, ext in enumerate(extensions):\n Path(f\"{source_dir}/file_{i}{ext}\").touch()\n Path(f\"{source_dir}/subdir1/file_{i}{ext}\").touch()\n Path(f\"{source_dir}/subdir1/subdir2/file_{i}{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result,\n 12,\n \"Should return 12 for all files in nested directories with matching extensions.\",\n )\n def test_case_6(self):\n # Test files with the same name in different subdirectories of the source directory\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n Path(f\"{source_dir}/subdir1\").mkdir()\n Path(f\"{source_dir}/subdir2\").mkdir()\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n # Create files with the same name in different subdirectories\n for ext in extensions:\n (Path(f\"{source_dir}/subdir1\") / f\"file{ext}\").touch()\n (Path(f\"{source_dir}/subdir2\") / f\"file{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result,\n 8,\n \"Should correctly move files with the same name from different source directories.\",\n )\n def test_case_7(self):\n # Test handling of invalid path inputs\n source_dir = \"/path/does/not/exist\"\n with tempfile.TemporaryDirectory() as target_dir:\n with self.assertRaises(FileNotFoundError):\n f_805(source_dir, target_dir)\n def test_case_8(self):\n # Test file renaming when handling duplicate files\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n extensions = self.valid_extensions\n for i, ext in 
enumerate(extensions):\n filename = f\"file_{i}{ext}\"\n # Create duplicate files in the source directory\n Path(os.path.join(source_dir, filename)).touch()\n # Create expected duplicate files in the target directory to force renaming\n Path(os.path.join(target_dir, filename)).touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(result, len(extensions), \"Should have moved all files.\")\n # Check if files were renamed correctly to avoid overwriting\n expected_files = [f\"file_{i}-1{ext}\" for i, ext in enumerate(extensions)]\n actual_files = [Path(f).name for f in glob.glob(f\"{target_dir}/*\")]\n for expected_file in expected_files:\n self.assertIn(\n expected_file,\n actual_files,\n f\"{expected_file} was not found in target directory.\",\n )", "apis": ["os.path.exists", "shutil.move", "os.path", "pathlib.Path", "os.makedirs", "os.path.join", "glob.glob"], "libs": ["glob", "shutil", "os", "pathlib"], "doc": {"description": ["Moves files with specific extensions from a source directory to a target directory,", "handling naming conflicts by renaming duplicates.", "Notes:", "- This function scans the source directory recursively to find files.", "- Files are filtered by the extensions: \".txt\", \".docx\", \".xlsx\", \".csv\".", "- Renaming of files due to naming conflicts follows the pattern '-n.'."], "note": [], "params": ["source_directory (str): The absolute or relative path of the source directory.", "target_directory (str): The absolute or relative path of the target directory.", "This function will create it if it does not exist."], "returns": ["int: The number of files successfully moved."], "reqs": ["os", "pathlib", "glob", "shutil"], "raises": ["FileNotFoundError: If source_directory does not exist."], "example": ["Examples:", ">>> f_805('./source_folder', './target_folder')", "3", ">>> f_805('./empty_folder', './target_folder')", "0"]}} -{"task_id": "f_811", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as 
plt\n\n\ndef f_811(df):\n \"\"\"\n Creates and return a heatmap of the cumulative sum of each column in a pandas DataFrame.\n\n Parameters:\n - df (pandas.DataFrame): A DataFrame with numerical values.\n\n Returns:\n - matplotlib.axes._subplots.AxesSubplot: The AxesSubplot object of the Seaborn heatmap.\n\n Raises:\n - ValueError: If the DataFrame is empty or if no numeric columns are present.\n\n Requirements:\n - pandas\n - matplotlib\n - seaborn\n\n Notes:\n - Only numeric columns are considered for the heatmap. Non-numeric columns are ignored.\n\n Example:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n >>> ax = f_811(df)\n \"\"\"", "canonical_solution": " numeric_df = df.select_dtypes(include=[\"number\"])\n if numeric_df.empty:\n raise ValueError(\"No numeric columns present\")\n\n df_cumsum = numeric_df.cumsum()\n ax = sns.heatmap(df_cumsum)\n return ax", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def test_cumsum_correctness(self):\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n ax = f_811(df)\n result_cumsum = df.cumsum().values.flatten()\n heatmap_data = ax.collections[0].get_array().data.flatten()\n np.testing.assert_array_equal(\n result_cumsum, heatmap_data, \"Cumulative sum calculation is incorrect\"\n )\n def test_non_numeric_columns_ignored(self):\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [\"one\", \"two\", \"three\"]})\n ax = f_811(df)\n self.assertIsInstance(\n ax, plt.Axes, \"The result should be a matplotlib AxesSubplot object\"\n )\n self.assertEqual(\n len(ax.get_xticklabels()), 1, \"Non-numeric columns should be ignored\"\n )\n def test_with_positive_numbers(self):\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib AxesSubplot object\"\n )\n def 
test_with_negative_numbers(self):\n df = pd.DataFrame({\"A\": [-1, -2, -3], \"B\": [-4, -5, -6]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib AxesSubplot object\"\n )\n def test_with_mixed_numbers(self):\n df = pd.DataFrame({\"A\": [1, -2, 3], \"B\": [-4, 5, -6]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib AxesSubplot object\"\n )\n def test_with_zeroes(self):\n df = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib AxesSubplot object\"\n )\n def test_with_empty_dataframe(self):\n df = pd.DataFrame({\"A\": [], \"B\": []})\n with self.assertRaises(ValueError):\n f_811(df)\n def test_no_numeric_columns(self):\n df = pd.DataFrame({\"A\": [\"one\", \"two\", \"three\"], \"B\": [\"four\", \"five\", \"six\"]})\n with self.assertRaises(ValueError):\n f_811(df)", "apis": ["seaborn.heatmap"], "libs": ["seaborn"], "doc": {"description": ["Creates and return a heatmap of the cumulative sum of each column in a pandas DataFrame.", "Notes:", "- Only numeric columns are considered for the heatmap. 
Non-numeric columns are ignored."], "note": [], "params": ["df (pandas.DataFrame): A DataFrame with numerical values."], "returns": ["matplotlib.axes._subplots.AxesSubplot: The AxesSubplot object of the Seaborn heatmap."], "reqs": ["pandas", "matplotlib", "seaborn"], "raises": ["ValueError: If the DataFrame is empty or if no numeric columns are present."], "example": [">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})", ">>> ax = f_811(df)"]}} -{"task_id": "f_875", "prompt": "import matplotlib.pyplot as plt\nimport random\nimport string\nimport pandas as pd\nimport seaborn as sns\n\n# Constants\nLETTERS = list(string.ascii_lowercase)\n\n\ndef f_875(rows=1000, string_length=3):\n \"\"\"\n Generate a dataframe of random strings and create a heatmap showing the correlation\n in the frequency of each letter in these strings.\n\n This function generates a specified number of random strings, each of a given length,\n and calculates the frequency of each letter in these strings. A heatmap of the \n correlation matrix is then displayed, showing the co-occurrence frequencies of different \n letters within these strings.\n\n If the number of rows specified is zero, the function will print a message indicating\n that no data is available to generate the heatmap and will return None. Otherwise, \n it processes the DataFrame to convert the generated strings into a one-hot encoded format\n and then sums up these encodings to calculate the frequency of each letter.\n\n Parameters:\n - rows (int, optional): Number of random strings to generate. Must be non-negative. \n Default is 1000. If set to 0, the function returns None after printing a message.\n - string_length (int, optional): Length of each random string. Must be non-negative. \n Default is 3. 
A value of 0 results in the generation of empty strings.\n\n Returns:\n - matplotlib.axes._subplots.AxesSubplot or None: A seaborn heatmap plot object if \n data is generated; otherwise, None.\n\n Requirements:\n - random\n - string\n - pandas\n - seaborn\n - matplotlib\n\n Note\n - If no strings are generated (e.g., rows = 0), the \n DataFrame will be empty. In this case, the function prints a message \"No data to generate heatmap.\" and returns None.\n - If the DataFrame is not empty, each string is split into its \n constituent letters, converted into one-hot encoded format, and then the frequency \n of each letter is calculated by summing these encodings.\n \n Example:\n >>> ax = f_875(1000, 3)\n >>> ax.get_xlim()\n (0.0, 26.0)\n \"\"\"", "canonical_solution": "\n # Generate random strings\n data = [\"\".join(random.choices(LETTERS, k=string_length)) for _ in range(rows)]\n\n # Create a DataFrame and compute letter frequency\n df = pd.DataFrame({\"String\": data})\n\n # Check if the DataFrame is empty\n if df.empty:\n print(\"No data to generate heatmap.\")\n return None\n\n df = pd.get_dummies(df[\"String\"].apply(list).explode()).groupby(level=0).sum()\n\n # Calculate the correlation matrix\n corr = df.corr()\n\n # Create and return the heatmap\n ax = sns.heatmap(corr, annot=True, fmt=\".2f\")\n plt.close() # Close the plot to prevent it from showing during function call\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_875.\"\"\"\n def test_default_parameters(self):\n \"\"\"\n Test f_875 with default parameters (rows=1000, string_length=3).\n Verifies if the function returns a matplotlib Axes object.\n \"\"\"\n random.seed(0)\n result = f_875()\n self.assertIsInstance(result, plt.Axes)\n def test_custom_rows(self):\n \"\"\"\n Test f_875 with a custom number of rows.\n Verifies if the function still returns a matplotlib Axes object.\n \"\"\"\n random.seed(1)\n result 
= f_875(rows=500)\n self.assertIsInstance(result, plt.Axes)\n def test_custom_string_length(self):\n \"\"\"\n Test f_875 with a custom string length.\n Verifies if the function still returns a matplotlib Axes object.\n \"\"\"\n random.seed(2)\n result = f_875(string_length=5)\n self.assertIsInstance(result, plt.Axes)\n def test_large_dataset(self):\n \"\"\"\n Test f_875 with a large dataset.\n Verifies if the function can handle a large number of rows without errors.\n \"\"\"\n random.seed(3)\n result = f_875(rows=10000, string_length=3)\n self.assertIsInstance(result, plt.Axes)\n def test_zero_rows(self):\n \"\"\"\n Test f_875 with zero rows.\n Verifies if the function handles edge case of zero rows by returning None.\n \"\"\"\n random.seed(4)\n result = f_875(rows=0)\n self.assertIsNone(result, \"Function should return None for zero rows.\")\n def tearDown(self):\n plt.close()", "apis": ["string.ascii_lowercase", "pandas.get_dummies", "seaborn.heatmap", "matplotlib.pyplot.close", "random.choices", "pandas.DataFrame"], "libs": ["string", "random", "matplotlib", "pandas", "seaborn"], "doc": {"description": ["Generate a dataframe of random strings and create a heatmap showing the correlation", "in the frequency of each letter in these strings.", "This function generates a specified number of random strings, each of a given length,", "and calculates the frequency of each letter in these strings. A heatmap of the", "correlation matrix is then displayed, showing the co-occurrence frequencies of different", "letters within these strings.", "If the number of rows specified is zero, the function will print a message indicating", "that no data is available to generate the heatmap and will return None. Otherwise,", "it processes the DataFrame to convert the generated strings into a one-hot encoded format", "and then sums up these encodings to calculate the frequency of each letter.", "Note", "- If no strings are generated (e.g., rows = 0), the", "DataFrame will be empty. 
In this case, the function prints a message \"No data to generate heatmap.\" and returns None.", "- If the DataFrame is not empty, each string is split into its", "constituent letters, converted into one-hot encoded format, and then the frequency", "of each letter is calculated by summing these encodings."], "note": [], "params": ["rows (int, optional): Number of random strings to generate. Must be non-negative.", "Default is 1000. If set to 0, the function returns None after printing a message.", "string_length (int, optional): Length of each random string. Must be non-negative.", "Default is 3. A value of 0 results in the generation of empty strings."], "returns": ["matplotlib.axes._subplots.AxesSubplot or None: A seaborn heatmap plot object if", "data is generated; otherwise, None."], "reqs": ["random", "string", "pandas", "seaborn", "matplotlib"], "raises": [], "example": [">>> ax = f_875(1000, 3)", ">>> ax.get_xlim()", "(0.0, 26.0)"]}} -{"task_id": "f_425", "prompt": "import sqlite3\nfrom random import choice, seed\nimport os\n\n\ndef f_425(db_name, table_name, num_entries, random_seed=None):\n \"\"\"\n Create an SQLite3 table and fill it with random data using the provided database and table names.\n\n The function populates the table with columns 'name', 'age', 'height' using random data from the\n following constants:\n - NAMES: List of names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia']\n - AGES: Range of ages from 18 to 65.\n - HEIGHTS: Range of heights from 150cm to 200cm.\n\n Parameters:\n db_name (str): The name of the SQLite3 database.\n table_name (str): The name of the table to create and populate.\n num_entries (int): The number of entries to insert. Must not be negative.\n random_seed (int, optional): The seed for generating random values. 
Default is None.\n\n Returns:\n str: The absolute path of the SQLite3 database file.\n\n Requirements:\n - sqlite3\n - random.choice\n - random.seed\n - os\n\n Example:\n >>> db_path = f_425('test.db', 'People', 100, random_seed=42)\n >>> print(db_path)\n '/absolute/path/to/test.db'\n \"\"\"", "canonical_solution": " NAMES = [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"]\n AGES = range(18, 65)\n HEIGHTS = range(150, 200)\n\n if random_seed:\n seed(random_seed)\n\n if num_entries < 0:\n raise ValueError(\"num_entries must not be negative\")\n\n conn = sqlite3.connect(db_name)\n cur = conn.cursor()\n cur.execute(f\"CREATE TABLE {table_name} (name TEXT, age INTEGER, height INTEGER)\")\n\n for _ in range(num_entries):\n name = choice(NAMES)\n age = choice(AGES)\n height = choice(HEIGHTS)\n cur.execute(f\"INSERT INTO {table_name} VALUES (?, ?, ?)\", (name, age, height))\n\n conn.commit()\n return os.path.abspath(db_name)", "test": "import unittest\nimport sqlite3\nimport os\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.temp_dir_path = self.temp_dir.name\n self.db_name = \"test_function.db\"\n self.db_path = os.path.join(self.temp_dir_path, self.db_name)\n self.table_name = \"TestTable\"\n self.random_seed = 42\n def tearDown(self):\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test basic case\n num_entries = 5\n db_path = f_425(\n self.db_path, self.table_name, num_entries, random_seed=self.random_seed\n )\n self.assertTrue(os.path.exists(db_path))\n self.verify_db_content(num_entries)\n def test_case_2(self):\n # Test handling 0 entries\n num_entries = 0\n db_path = f_425(\n self.db_path, self.table_name, num_entries, random_seed=self.random_seed\n )\n self.assertTrue(os.path.exists(db_path))\n self.verify_db_content(num_entries)\n def test_case_3(self):\n # Test handling 1 entry\n num_entries = 1\n db_path = f_425(\n self.db_path, self.table_name, num_entries, 
random_seed=self.random_seed\n )\n self.assertTrue(os.path.exists(db_path))\n self.verify_db_content(num_entries)\n def test_case_4(self):\n # Test handling invalid num_entries\n with self.assertRaises(Exception):\n f_425(self.db_path, self.table_name, -1, random_seed=self.random_seed)\n with self.assertRaises(Exception):\n f_425(self.db_path, self.table_name, \"1\", random_seed=self.random_seed)\n def test_case_5(self):\n # Test invalid table names (SQL keywords)\n with self.assertRaises(sqlite3.OperationalError):\n f_425(self.db_path, \"Select\", 10)\n def test_case_6(self):\n # Test against SQL injection in table_name parameter\n malicious_name = \"Test; DROP TABLE IntegrityCheck;\"\n with self.assertRaises(sqlite3.OperationalError):\n f_425(self.db_path, malicious_name, 1)\n def verify_db_content(self, num_entries):\n # Connect to the database and check if the table has correct number of entries\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(f\"SELECT COUNT(*) FROM {self.table_name}\")\n count = cur.fetchone()[0]\n self.assertEqual(count, num_entries)\n # Verify data integrity\n cur.execute(f\"SELECT name, age, height FROM {self.table_name}\")\n rows = cur.fetchall()\n for row in rows:\n self.assertIn(row[0], [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"])\n self.assertIn(row[1], list(range(18, 65)))\n self.assertIn(row[2], list(range(150, 200)))", "apis": ["random.seed", "random.choice", "os.path", "os.path.abspath", "sqlite3.connect"], "libs": ["sqlite3", "random", "os"], "doc": {"description": ["Create an SQLite3 table and fill it with random data using the provided database and table names.", "The function populates the table with columns 'name', 'age', 'height' using random data from the", "following constants:", "- NAMES: List of names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia']", "- AGES: Range of ages from 18 to 65.", "- HEIGHTS: Range of heights from 150cm to 200cm."], "note": [], "params": ["db_name 
(str): The name of the SQLite3 database.", "table_name (str): The name of the table to create and populate.", "num_entries (int): The number of entries to insert. Must not be negative.", "random_seed (int, optional): The seed for generating random values. Default is None."], "returns": ["str: The absolute path of the SQLite3 database file."], "reqs": ["sqlite3", "random.choice", "random.seed", "os"], "raises": [], "example": [">>> db_path = f_425('test.db', 'People', 100, random_seed=42)", ">>> print(db_path)", "'/absolute/path/to/test.db'"]}} -{"task_id": "f_408", "prompt": "import collections\nimport matplotlib.pyplot as plt\n\n\ndef f_408(data):\n \"\"\"\n Combine a list of dictionaries with the same keys (fruit names) into a single dictionary,\n calculate the total turnover for each fruit, and return a bar chart's axes with colors representing\n different fruits. The colors are selected from: 'red', 'yellow', 'green', 'blue', 'purple'. The function\n ensures that sales quantity must not be negative, throwing a ValueError if encountered.\n\n Parameters:\n data (list): A list of dictionaries. 
The keys are fruit names and the values are sales quantities.\n Sales quantity must not be negative.\n\n Returns:\n total_sales (dict): A dictionary containing the total sales for each fruit.\n ax (matplotlib.container.BarContainer): A bar chart of total fruit sales, or None if data is empty\n\n Requirements:\n - collections\n - matplotlib.pyplot\n\n Example:\n >>> sales, plot = f_408([{'apple': 10, 'banana': 15, 'cherry': 12},\\\n {'apple': 12, 'banana': 20, 'cherry': 14},\\\n {'apple': 15, 'banana': 18, 'cherry': 15},\\\n {'apple': 11, 'banana': 17, 'cherry': 13}])\n >>> sales\n {'apple': 48, 'banana': 70, 'cherry': 54}\n >>> type(plot)\n \n \"\"\"", "canonical_solution": " if not data:\n return dict(), None\n\n all_keys = set().union(*data)\n for d in data:\n for k, v in d.items():\n if v < 0:\n raise ValueError(\"Sales quantity must not be negative.\")\n\n combined_dict = dict((k, [d.get(k, 0) for d in data]) for k in all_keys)\n total_sales = {k: sum(v) for k, v in combined_dict.items()}\n total_sales = dict(collections.OrderedDict(sorted(total_sales.items())))\n labels, values = zip(*total_sales.items())\n\n # Define colors dynamically to handle different numbers of fruit types\n colors = [\"red\", \"yellow\", \"green\", \"blue\", \"purple\"] * (len(labels) // 5 + 1)\n\n ax = plt.bar(labels, values, color=colors[: len(labels)])\n plt.xlabel(\"Fruit\")\n plt.ylabel(\"Total Sales\")\n plt.title(\"Total Fruit Sales\")\n\n return total_sales, ax", "test": "import unittest\nimport collections\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case with one fruit\n data = [{\"apple\": 5}, {\"apple\": 7}, {\"apple\": 3}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 15}\n self.assertDictEqual(sales, expected_sales)\n def test_case_2(self):\n # Test basic case with multiple fruits\n data = [\n {\"apple\": 10, \"banana\": 15, \"cherry\": 12, \"date\": 10},\n {\"apple\": 12, \"banana\": 20, 
\"cherry\": 14, \"date\": 9},\n {\"apple\": 15, \"banana\": 18, \"cherry\": 15, \"date\": 8},\n {\"apple\": 11, \"banana\": 17, \"cherry\": 13, \"date\": 7},\n ]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 48, \"banana\": 70, \"cherry\": 54, \"date\": 34}\n self.assertDictEqual(sales, expected_sales)\n def test_case_3(self):\n # Test basic case with one entry per fruit\n data = [{\"apple\": 1}, {\"banana\": 2}, {\"cherry\": 3}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 1, \"banana\": 2, \"cherry\": 3}\n self.assertDictEqual(sales, expected_sales)\n def test_case_4(self):\n # Test zero quantities\n data = [\n {\"apple\": 0, \"banana\": 0},\n {\"apple\": 0, \"banana\": 0},\n {\"apple\": 0, \"banana\": 0},\n ]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 0, \"banana\": 0}\n self.assertDictEqual(sales, expected_sales)\n def test_case_5(self):\n # Test empty data\n data = []\n sales, _ = f_408(data)\n expected_sales = {}\n self.assertDictEqual(sales, expected_sales)\n def test_case_6(self):\n # Test missing fruit\n data = [{\"apple\": 10, \"banana\": 5}, {\"banana\": 15, \"cherry\": 7}, {\"cherry\": 3}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 10, \"banana\": 20, \"cherry\": 10}\n self.assertDictEqual(sales, expected_sales)\n def test_case_7(self):\n # Test negative sales\n data = [{\"apple\": -10, \"banana\": 15}, {\"apple\": 12, \"banana\": -20}]\n with self.assertRaises(ValueError):\n f_408(data)\n def test_case_8(self):\n # Test large values\n data = [\n {\"apple\": 1000000, \"banana\": 500000},\n {\"apple\": 2000000, \"banana\": 1500000},\n ]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 3000000, \"banana\": 2000000}\n self.assertDictEqual(sales, expected_sales)\n def test_case_9(self):\n # Test visualization\n data = [{\"apple\": 10, \"banana\": 15}, {\"banana\": 5, \"apple\": 10}]\n _, plot = f_408(data)\n self.assertEqual(\n len(plot.patches), 2\n ) # Checking if the number of bars in the plot is 
correct\n def test_case_10(self):\n # Test non-string keys\n data = [{5: 10, \"banana\": 15}, {\"banana\": 5, 5: 10}]\n with self.assertRaises(TypeError):\n f_408(data)\n def test_case_11(self):\n # Test mixed types in sales\n data = [{\"apple\": 10.5, \"banana\": 15}, {\"apple\": 12, \"banana\": 20.5}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 22.5, \"banana\": 35.5}\n self.assertDictEqual(sales, expected_sales)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.xlabel", "collections.OrderedDict", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.bar"], "libs": ["matplotlib", "collections"], "doc": {"description": ["Combine a list of dictionaries with the same keys (fruit names) into a single dictionary,", "calculate the total turnover for each fruit, and return a bar chart's axes with colors representing", "different fruits. The colors are selected from: 'red', 'yellow', 'green', 'blue', 'purple'. The function", "ensures that sales quantity must not be negative, throwing a ValueError if encountered."], "note": [], "params": ["data (list): A list of dictionaries. 
The keys are fruit names and the values are sales quantities.", "Sales quantity must not be negative."], "returns": ["total_sales (dict): A dictionary containing the total sales for each fruit.", "ax (matplotlib.container.BarContainer): A bar chart of total fruit sales, or None if data is empty"], "reqs": ["collections", "matplotlib.pyplot"], "raises": [], "example": [">>> sales, plot = f_408([{'apple': 10, 'banana': 15, 'cherry': 12},\\", "{'apple': 12, 'banana': 20, 'cherry': 14},\\", "{'apple': 15, 'banana': 18, 'cherry': 15},\\", "{'apple': 11, 'banana': 17, 'cherry': 13}])", ">>> sales", "{'apple': 48, 'banana': 70, 'cherry': 54}", ">>> type(plot)", ""]}} -{"task_id": "f_332", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_332(data):\n \"\"\"Scales numeric columns of a data dictionary using the StandardScaler.\n\n This function scales the numeric columns of a dataframe using the StandardScaler from scikit-learn.\n Non-numeric columns remain unchanged. If a column contains mixed data types, it tries to convert the entire column\n to float. 
If any value in the column cannot be converted to float, the entire column is left unchanged.\n\n Requirements:\n - pandas\n - sklearn.preprocessing.StandardScaler\n \n Parameters:\n - data (dict): Input data.\n\n Returns:\n - pd.DataFrame: Dataframe with scaled numeric columns.\n\n Example:\n >>> result = f_332({'x': [10, 20, 30, 40]})\n >>> result\n x\n 0 -1.341641\n 1 -0.447214\n 2 0.447214\n 3 1.341641\n >>> result2 = f_332({'a': [10.5, 23.4, 15.6, 78.9],'b': [45.6, 67.8, 89.0, 12.3],'c': ['apple', 'banana', 'cherry', 'date']})\n >>> result2\n a b c\n 0 -0.788098 -0.284409 apple\n 1 -0.317428 0.497496 banana\n 2 -0.602019 1.244180 cherry\n 3 1.707546 -1.457267 date\n \"\"\"", "canonical_solution": " dataframe = pd.DataFrame(data)\n # Initialize the scaler\n scaler = StandardScaler()\n\n # Iterate over columns and scale if they are numeric\n for column in dataframe.columns:\n if dataframe[column].dtype in [\"float64\", \"int64\"]:\n dataframe[column] = scaler.fit_transform(\n dataframe[column].values.reshape(-1, 1)\n )\n else:\n # Attempt to convert the entire column to float and then scale\n converted_column = dataframe[column].apply(pd.to_numeric, errors=\"coerce\")\n if (\n not converted_column.isna().all()\n ): # If all values are convertible to float\n dataframe[column] = scaler.fit_transform(\n converted_column.values.reshape(-1, 1)\n )\n return dataframe", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n \"\"\"Test the correctness of the scaling applied by the function.\"\"\"\n # Creating a sample dataframe with three numeric columns\n data = {\n \"a\": [10.5, 23.4, 15.6, 78.9],\n \"b\": [45.6, 67.8, 89.0, 12.3],\n \"c\": [12.3, 45.6, 78.9, 0.1],\n }\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n 
self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))\n def test_case_2(self):\n \"\"\"Test with an empty DataFrame.\"\"\"\n # Creating an empty dataframe\n data = {}\n df = pd.DataFrame(data)\n result = f_332(data)\n # Ensuring the result is also an empty dataframe\n self.assertTrue(result.empty)\n def test_case_3(self):\n \"\"\"Test with a DataFrame that doesn't have any columns to scale.\"\"\"\n # Creating a dataframe with a single non-numeric column\n data = {\"c\": [\"foo\", \"bar\"]}\n df = pd.DataFrame(data)\n result = f_332(data)\n # Ensuring the output dataframe is unchanged\n pd.testing.assert_frame_equal(result, df, check_dtype=False)\n def test_case_4(self):\n \"\"\"Test with a DataFrame where all columns are to be scaled.\"\"\"\n # Creating a dataframe with two numeric columns\n data = {\"a\": [10.5, 23.4, 15.6, 78.9], \"b\": [45.6, 67.8, 89.0, 12.3]}\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))\n def test_case_5(self):\n \"\"\"Test with a DataFrame with single rows.\"\"\"\n # Creating a dataframe with a single row and three columns\n data = {\"a\": [5.5], \"b\": [8.6], \"c\": [7.7]}\n df = pd.DataFrame(data)\n result = f_332(data)\n self.assertDictEqual(result.to_dict(), {'a': {0: 0.0}, 'b': {0: 0.0}, 'c': {0: 0.0}})\n def test_case_6(self):\n \"\"\"Test with a DataFrame with mixed datatypes.\"\"\"\n # Creating a dataframe with mixed data types (both floats and strings) in columns\n data = {\n \"a\": [10.5, 23.4, 15.6, \"78.9\"],\n 
\"b\": [45.6, \"67.8\", 89.0, 12.3],\n \"c\": [12.3, 45.6, 78.9, \"0.1\"],\n }\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))\n def test_case_7(self):\n \"\"\"Test with a DataFrame with negative values.\"\"\"\n # Creating a dataframe with negative values in columns\n data = {\"a\": [-1, -2, -3, -4], \"b\": [-4, -5, -6, -7], \"c\": [-7, -8, -9, -10]}\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))", "apis": ["pandas.to_numeric", "pandas.DataFrame", "sklearn.preprocessing.StandardScaler"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Scales numeric columns of a data dictionary using the StandardScaler.", "This function scales the numeric columns of a dataframe using the StandardScaler from scikit-learn.", "Non-numeric columns remain unchanged. If a column contains mixed data types, it tries to convert the entire column", "to float. 
If any value in the column cannot be converted to float, the entire column is left unchanged."], "note": [], "params": ["data (dict): Input data."], "returns": ["pd.DataFrame: Dataframe with scaled numeric columns."], "reqs": ["pandas", "sklearn.preprocessing.StandardScaler"], "raises": [], "example": [">>> result = f_332({'x': [10, 20, 30, 40]})", ">>> result", "x", "0 -1.341641", "1 -0.447214", "2 0.447214", "3 1.341641", ">>> result2 = f_332({'a': [10.5, 23.4, 15.6, 78.9],'b': [45.6, 67.8, 89.0, 12.3],'c': ['apple', 'banana', 'cherry', 'date']})", ">>> result2", "a b c", "0 -0.788098 -0.284409 apple", "1 -0.317428 0.497496 banana", "2 -0.602019 1.244180 cherry", "3 1.707546 -1.457267 date"]}} -{"task_id": "f_389", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport random\n\n\ndef f_389(\n epoch_milliseconds,\n random_seed=0,\n products=[\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n):\n \"\"\"\n Generate sales data for five products from a given epoch time up to the current time.\n\n This function checks input validity, then for each day between the date of the given epoch\n time to the date of the current time, generates random sales data for each of the 5 products.\n\n Parameters:\n - epoch_milliseconds (int): Start epoch time in milliseconds. Must be before current system time.\n - random_seed (int): Seed for reproducibility of random sales data. Defaults to 0.\n - products (list of str): Product list to choose from. Must contain 5 unique strings.\n Defaults to ['Product1', 'Product2', 'Product3', 'Product4', 'Product5'].\n\n Returns:\n - pd.DataFrame: A DataFrame containing sales data with columns 'Product' (string), 'Date' (datetime),\n and 'Sales' (integer). 
Sales quantity is randomly sampled from range [10, 50].\n\n Requirements:\n - pandas\n - datetime.datetime\n - random\n\n Example:\n >>> sales_data = f_389(1236472051807, random_seed=42)\n >>> type(sales_data)\n \n >>> sales_data.head()\n Product Date Sales\n 0 Product4 2009-03-08 11:27:31.807 50\n 1 Product5 2009-03-08 11:27:31.807 17\n 2 Product1 2009-03-08 11:27:31.807 11\n 3 Product3 2009-03-08 11:27:31.807 27\n 4 Product2 2009-03-08 11:27:31.807 25\n \"\"\"", "canonical_solution": " random.seed(random_seed)\n\n products = list(set(products))\n if len(products) != 5:\n raise ValueError(\"Products must contain 5 unique items\")\n\n start_date = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n end_date = datetime.now()\n if start_date >= end_date:\n raise ValueError(\"Start time must be before current system time\")\n\n date_range = pd.date_range(start_date, end_date, freq=\"D\")\n sales_data = []\n for date in date_range:\n for product in products:\n sales = random.randint(10, 50)\n sales_data.append([product, date, sales])\n\n df = pd.DataFrame(sales_data, columns=[\"Product\", \"Date\", \"Sales\"])\n return df", "test": "import unittest\nfrom datetime import datetime, timedelta\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n sales_data = f_389(1631289600000, random_seed=42)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], datetime.fromtimestamp(1631289600000 / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())),\n [\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n )\n def test_case_2(self):\n # Test 3 days ago\n three_days_ago = (datetime.now() - timedelta(days=3)).timestamp() * 1000\n sales_data = f_389(three_days_ago, random_seed=42)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], 
datetime.fromtimestamp(three_days_ago / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())),\n [\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n )\n def test_case_3(self):\n # Test 1 month ago\n one_month_ago = (datetime.now() - timedelta(days=30)).timestamp() * 1000\n sales_data = f_389(one_month_ago, random_seed=42)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], datetime.fromtimestamp(one_month_ago / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())),\n [\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n )\n def test_case_4(self):\n # Test custom products\n custom_products = [\"apple\", \"banana\", \"carrot\", \"durian\", \"eggplant\"]\n sales_data = f_389(1577836800000, random_seed=42, products=custom_products)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], datetime.fromtimestamp(1577836800000 / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())), custom_products\n )\n def test_case_5(self):\n # Test handling invalid time - future\n with self.assertRaises(ValueError):\n f_389(int((datetime.now() + timedelta(days=1)).timestamp() * 1000))\n def test_case_6(self):\n # Test handling invalid products - 4 unique items\n with self.assertRaises(ValueError):\n f_389(1631289600000, products=[\"this\", \"is\", \"too\", \"short\"])\n def test_case_7(self):\n # Test handling invalid products - 5 items but with duplicates\n with self.assertRaises(ValueError):\n f_389(1631289600000, products=[\"a\", \"a\", \"b\", \"c\", \"d\"])", "apis": ["random.randint", "random.seed", "datetime.datetime.now", "pandas.date_range", "datetime.datetime.fromtimestamp", "pandas.DataFrame"], "libs": ["pandas", "random", "datetime"], "doc": {"description": ["Generate sales data 
for five products from a given epoch time up to the current time.", "This function checks input validity, then for each day between the date of the given epoch", "time to the date of the current time, generates random sales data for each of the 5 products."], "note": [], "params": ["epoch_milliseconds (int): Start epoch time in milliseconds. Must be before current system time.", "random_seed (int): Seed for reproducibility of random sales data. Defaults to 0.", "products (list of str): Product list to choose from. Must contain 5 unique strings.", "Defaults to ['Product1', 'Product2', 'Product3', 'Product4', 'Product5']."], "returns": ["pd.DataFrame: A DataFrame containing sales data with columns 'Product' (string), 'Date' (datetime),", "and 'Sales' (integer). Sales quantity is randomly sampled from range [10, 50]."], "reqs": ["pandas", "datetime.datetime", "random"], "raises": [], "example": [">>> sales_data = f_389(1236472051807, random_seed=42)", ">>> type(sales_data)", "", ">>> sales_data.head()", "Product Date Sales", "0 Product4 2009-03-08 11:27:31.807 50", "1 Product5 2009-03-08 11:27:31.807 17", "2 Product1 2009-03-08 11:27:31.807 11", "3 Product3 2009-03-08 11:27:31.807 27", "4 Product2 2009-03-08 11:27:31.807 25"]}} -{"task_id": "f_931", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\ndef f_931(mean=123456.908, std_dev=1.2, save_plots=False):\n \"\"\"\n Generate a random sample from a normal distribution, analyze its skewness and kurtosis,\n and create a histogram and a QQ plot to visualize the distribution.\n\n Parameters:\n - mean (float, optional): Mean of the normal distribution. Defaults to 123456.908.\n - std_dev (float, optional): Standard deviation of the normal distribution. Defaults to 1.2.\n - save_plots (bool, optional): If True, saves the plots to files. 
Defaults to False.\n\n Returns:\n - float: Skewness of the sample.\n - float: Kurtosis of the sample.\n - list: Paths to the saved plot files, empty if save_plots is False.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - scipy.stats\n\n Example:\n >>> np.random.seed(0)\n >>> skewness, kurtosis, plot_paths = f_931(123456.908, 1.2, True)\n >>> print(f'Skewness: {skewness}, Kurtosis: {kurtosis}, Plots: {plot_paths}')\n Skewness: 0.03385895323538189, Kurtosis: -0.04676632447765128, Plots: ['histogram_plot.png', 'qq_plot.png']\n\n \"\"\"", "canonical_solution": " sample = np.random.normal(mean, std_dev, 1000)\n plot_paths = []\n\n # Plotting histogram\n plt.figure()\n plt.hist(sample, bins=50)\n if save_plots:\n hist_path = \"histogram_plot.png\"\n plt.savefig(hist_path)\n plt.close()\n plot_paths.append(hist_path)\n\n # Plotting QQ diagram\n plt.figure()\n stats.probplot(sample, plot=plt)\n if save_plots:\n qq_path = \"qq_plot.png\"\n plt.savefig(qq_path)\n plt.close()\n plot_paths.append(qq_path)\n\n skewness = stats.skew(sample)\n kurtosis = stats.kurtosis(sample)\n\n return skewness, kurtosis, plot_paths", "test": "import unittest\nimport os\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_931.\"\"\"\n def test_default_parameters(self):\n \"\"\"\n Test f_931 with default parameters.\n \"\"\"\n np.random.seed(0)\n skewness, kurtosis, plot_paths = f_931()\n self.assertAlmostEqual(skewness, 0, delta=0.5)\n self.assertAlmostEqual(kurtosis, 0, delta=0.5)\n self.assertEqual(len(plot_paths), 0)\n def test_save_plots_true(self):\n \"\"\"\n Test f_931 with save_plots set to True.\n \"\"\"\n np.random.seed(1)\n _, _, plot_paths = f_931(save_plots=True)\n self.assertEqual(len(plot_paths), 2)\n for path in plot_paths:\n self.assertTrue(os.path.exists(path))\n os.remove(path) # Clean up: remove created files\n def test_custom_mean_std_dev(self):\n \"\"\"\n Test f_931 with custom mean and standard deviation.\n \"\"\"\n np.random.seed(2)\n 
mean = 100\n std_dev = 10\n skewness, kurtosis, _ = f_931(mean, std_dev)\n self.assertAlmostEqual(skewness, 0, delta=1)\n self.assertAlmostEqual(kurtosis, 0, delta=1)\n def test_negative_std_dev(self):\n \"\"\"\n Test f_931 with a negative standard deviation.\n \"\"\"\n np.random.seed(3)\n with self.assertRaises(ValueError):\n f_931(std_dev=-1)\n def test_large_sample(self):\n \"\"\"\n Test f_931 with a larger sample size.\n \"\"\"\n np.random.seed(4)\n _, _, plot_paths = f_931(mean=1000, std_dev=50, save_plots=True)\n self.assertEqual(len(plot_paths), 2)\n for path in plot_paths:\n self.assertTrue(os.path.exists(path))\n os.remove(path) # Clean up: remove created files", "apis": ["scipy.stats.probplot", "numpy.random", "scipy.stats.skew", "matplotlib.pyplot.hist", "matplotlib.pyplot.close", "matplotlib.pyplot.savefig", "matplotlib.pyplot.figure", "numpy.random.normal", "scipy.stats.kurtosis"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Generate a random sample from a normal distribution, analyze its skewness and kurtosis,", "and create a histogram and a QQ plot to visualize the distribution."], "note": [], "params": ["mean (float, optional): Mean of the normal distribution. Defaults to 123456.908.", "std_dev (float, optional): Standard deviation of the normal distribution. Defaults to 1.2.", "save_plots (bool, optional): If True, saves the plots to files. 
Defaults to False."], "returns": ["float: Skewness of the sample.", "float: Kurtosis of the sample.", "list: Paths to the saved plot files, empty if save_plots is False."], "reqs": ["numpy", "matplotlib.pyplot", "scipy.stats"], "raises": [], "example": [">>> np.random.seed(0)", ">>> skewness, kurtosis, plot_paths = f_931(123456.908, 1.2, True)", ">>> print(f'Skewness: {skewness}, Kurtosis: {kurtosis}, Plots: {plot_paths}')", "Skewness: 0.03385895323538189, Kurtosis: -0.04676632447765128, Plots: ['histogram_plot.png', 'qq_plot.png']"]}} -{"task_id": "f_753", "prompt": "from functools import reduce\nimport operator\nimport string\n\ndef f_753(letters):\n \"\"\"\n Calculate the product of the corresponding numbers for a list of uppercase letters, \n where \\\"A\\\" corresponds to 1, \\\"B\\\" to 2, etc.\n \n Parameters:\n letters (list of str): A list of uppercase letters.\n \n Returns:\n int: The product of the numbers corresponding to the input letters.\n \n Requirements:\n - functools.reduce\n - operator\n - string\n \n Examples:\n >>> f_753([\\\"A\\\", \\\"B\\\", \\\"C\\\"])\n 6\n \n >>> f_753([\\\"A\\\", \\\"E\\\", \\\"I\\\"])\n 45\n \n Note:\n The function uses a predefined dictionary to map each uppercase letter to its corresponding number.\n \"\"\"", "canonical_solution": " # Creating a dictionary to map each letter to its corresponding number\n letter_to_number = {letter: i+1 for i, letter in enumerate(string.ascii_uppercase)}\n \n # Convert the letters to numbers\n numbers = [letter_to_number[letter] for letter in letters]\n \n # Calculate the product using functools.reduce and operator.mul\n product = reduce(operator.mul, numbers, 1)\n \n return product", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Input: [\"A\", \"B\", \"C\"]\n # Expected Output: 6 (1 * 2 * 3)\n result = f_753([\"A\", \"B\", \"C\"])\n self.assertEqual(result, 6)\n \n def test_case_2(self):\n # Input: [\"A\", \"E\", \"I\"]\n # Expected Output: 
45 (1 * 5 * 9)\n result = f_753([\"A\", \"E\", \"I\"])\n self.assertEqual(result, 45)\n def test_case_3(self):\n # Input: [\"Z\"]\n # Expected Output: 26\n result = f_753([\"Z\"])\n self.assertEqual(result, 26)\n def test_case_4(self):\n # Input: [\"X\", \"Y\", \"Z\"]\n # Expected Output: 24 * 25 * 26\n result = f_753([\"X\", \"Y\", \"Z\"])\n self.assertEqual(result, 24 * 25 * 26)\n \n def test_case_5(self):\n # Input: [\"A\", \"A\", \"A\"]\n # Expected Output: 1 (1 * 1 * 1)\n result = f_753([\"A\", \"A\", \"A\"])\n self.assertEqual(result, 1)", "apis": ["operator.mul", "string.ascii_uppercase", "functools.reduce"], "libs": ["string", "functools", "operator"], "doc": {"description": ["Calculate the product of the corresponding numbers for a list of uppercase letters,", "where \\\"A\\\" corresponds to 1, \\\"B\\\" to 2, etc.", ">>> f_753([\\\"A\\\", \\\"E\\\", \\\"I\\\"])", "45"], "note": ["The function uses a predefined dictionary to map each uppercase letter to its corresponding number."], "params": ["letters (list of str): A list of uppercase letters."], "returns": ["int: The product of the numbers corresponding to the input letters."], "reqs": ["functools.reduce", "operator", "string"], "raises": [], "example": ["Examples:", ">>> f_753([\\\"A\\\", \\\"B\\\", \\\"C\\\"])", "6"]}} -{"task_id": "f_822", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\n\n\ndef f_822(\n feature_array,\n target_array,\n feature_names=[\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"],\n target_name=\"target\",\n seed=None,\n):\n \"\"\"\n Shuffle the columns of a given numpy array and train a Random Forest Classifier on the shuffled data.\n\n Parameters:\n - feature_array (numpy.ndarray): 2D array containing the feature data with shape (n_samples, n_features).\n - target_array (numpy.ndarray): 1D array containing the target data with shape (n_samples,).\n - feature_names (list of str, optional): Names of the features corresponding to the 
columns in `feature_array`.\n Defaults to ['f1', 'f2', 'f3', 'f4', 'f5'].\n - target_name (str, optional): Name of the target column. Defaults to 'target'.\n - seed (int, optional): Seed for the random number generator to make shuffling reproducible. Defaults to None.\n\n Returns:\n sklearn.ensemble.RandomForestClassifier: A trained Random Forest Classifier on the shuffled feature data.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Examples:\n >>> feature_array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n >>> target_array = np.array([0, 1])\n >>> clf = f_822(feature_array, target_array)\n >>> type(clf)\n \n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n shuffled_array = feature_array.copy()\n np.random.shuffle(shuffled_array.T)\n\n df = pd.DataFrame(shuffled_array, columns=feature_names)\n df[target_name] = target_array\n\n clf = RandomForestClassifier()\n clf.fit(df[feature_names], df[target_name])\n\n return clf", "test": "import unittest\nimport numpy as np\nfrom sklearn.ensemble import RandomForestClassifier\nimport warnings\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n target = np.array([0, 1])\n clf = f_822(array, target)\n self.assertIsInstance(clf, RandomForestClassifier)\n self.assertTrue(len(clf.feature_importances_) > 0)\n self.assertEqual(set(np.unique(target)), set(clf.classes_))\n with warnings.catch_warnings():\n # Temporarily suppress warning - clf prefers named array\n warnings.simplefilter(\"ignore\", category=UserWarning)\n predictions = clf.predict(array)\n np.testing.assert_array_equal(\n predictions,\n target,\n \"The model's predictions do not match the expected target values.\",\n )\n def test_case_2(self):\n # Test identical features\n array = np.ones((10, 5))\n target = np.zeros(10)\n clf = f_822(array, target)\n self.assertTrue(len(clf.feature_importances_) > 0)\n def test_case_3(self):\n # Test all 
unique targets\n array = np.array([[i] * 5 for i in range(10)])\n target = np.arange(10)\n clf = f_822(array, target)\n self.assertEqual(len(np.unique(target)), len(clf.classes_))\n def test_case_4(self):\n # Test random seed reproducibility\n np.random.seed(0)\n array = np.random.rand(10, 5)\n target = np.random.randint(0, 2, 10)\n clf1 = f_822(array, target, seed=42)\n clf2 = f_822(array, target, seed=42)\n self.assertEqual(\n clf1.feature_importances_.tolist(), clf2.feature_importances_.tolist()\n )\n def test_case_5(self):\n # Test negative features\n array = np.array([[-1, -2, -3, -4, -5], [-6, -7, -8, -9, -10]])\n target = np.array([0, 1])\n clf = f_822(array, target)\n self.assertTrue(len(clf.feature_importances_) > 0)\n def test_case_6(self):\n # Test single feature array\n array = np.arange(10).reshape(-1, 1)\n target = np.array([0, 1] * 5)\n feature_names = [\"f1\"]\n clf = f_822(array, target, feature_names)\n self.assertTrue(len(clf.feature_importances_) > 0)\n def test_case_7(self):\n # Test exception handling for incompatible shapes among arrays\n array = np.array([[1, 2, 3], [4, 5, 6]])\n target = np.array([0, 1, 2])\n with self.assertRaises(ValueError):\n f_822(array, target)\n def test_case_8(self):\n # Test exception handling for incompatible feature_names vs array shape\n array = np.array([[1, 2, 3], [4, 5, 6]]) # 2x3 array\n target = np.array([0, 1])\n incorrect_feature_names = [\"f1\", \"f2\"] # Only 2 names for a 3-column array\n with self.assertRaises(ValueError):\n f_822(array, target, feature_names=incorrect_feature_names)\n def test_case_9(self):\n # Test custom feature names\n array = np.array([[7, 8], [9, 10]])\n target = np.array([0, 1])\n custom_feature_names = [\"custom1\", \"custom2\"]\n clf = f_822(array, target, feature_names=custom_feature_names)\n self.assertEqual(clf.feature_importances_.size, len(custom_feature_names))\n def test_case_10(self):\n # Test custom target name\n array = np.array([[11, 12, 13, 14, 15], [16, 17, 18, 
19, 20]])\n target = np.array([1, 0])\n custom_target_name = \"custom_target\"\n clf = f_822(array, target, target_name=custom_target_name)\n # Check if the model was trained successfully\n self.assertTrue(len(clf.feature_importances_) > 0)", "apis": ["numpy.random", "numpy.random.seed", "sklearn.ensemble.RandomForestClassifier", "numpy.random.shuffle", "pandas.DataFrame"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Shuffle the columns of a given numpy array and train a Random Forest Classifier on the shuffled data."], "note": [], "params": ["feature_array (numpy.ndarray): 2D array containing the feature data with shape (n_samples, n_features).", "target_array (numpy.ndarray): 1D array containing the target data with shape (n_samples,).", "feature_names (list of str, optional): Names of the features corresponding to the columns in `feature_array`.", "Defaults to ['f1', 'f2', 'f3', 'f4', 'f5'].", "target_name (str, optional): Name of the target column. Defaults to 'target'.", "seed (int, optional): Seed for the random number generator to make shuffling reproducible. 
Defaults to None."], "returns": ["sklearn.ensemble.RandomForestClassifier: A trained Random Forest Classifier on the shuffled feature data."], "reqs": ["numpy", "pandas", "sklearn"], "raises": [], "example": ["Examples:", ">>> feature_array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])", ">>> target_array = np.array([0, 1])", ">>> clf = f_822(feature_array, target_array)", ">>> type(clf)", ""]}} -{"task_id": "f_882", "prompt": "from datetime import datetime\nimport json\n\nSERVER_ADDRESS = \"localhost\"\nBUFFER_SIZE = 1024\n\n\ndef f_882(client_socket):\n \"\"\"\n Responds to a client's request by sending a JSON-formatted message containing\n the current server time and a greeting.\n\n Parameters:\n - client_socket (socket.socket): The client socket from which the request is received.\n\n Requirements:\n - datetime.datetime\n - json\n\n Returns:\n - None\n\n Example:\n >>> import socket\n >>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n >>> server_socket.bind((SERVER_ADDRESS, 0)) # Bind to a free port\n >>> server_socket.bind((SERVER_ADDRESS, 8080))\n >>> server_socket.listen(1)\n >>> try:\n ... client_socket, _ = server_socket.accept()\n ... f_882(client_socket)\n ... finally:\n ... 
server_socket.close()\n \"\"\"", "canonical_solution": " response_data = {\"message\": \"Hello\", \"time\": str(datetime.now())}\n response = json.dumps(response_data) + \"\\n\"\n client_socket.send(response.encode(\"utf-8\"))\n client_socket.close()", "test": "import unittest\nimport socket\nimport threading\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_882.\"\"\"\n def setUp(self):\n \"\"\"Set up a server socket for testing.\"\"\"\n self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n self.server_socket.bind((SERVER_ADDRESS, 0)) # Bind to a free port\n self.server_socket.listen(1)\n self.port = self.server_socket.getsockname()[1]\n def tearDown(self):\n \"\"\"Close the server socket after each test.\"\"\"\n self.server_socket.close()\n def client_thread_function(self, responses, request_message):\n \"\"\"Function to simulate a client sending a request and receiving a response.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client_socket:\n client_socket.connect((SERVER_ADDRESS, self.port))\n client_socket.send(request_message + b\"\\n\") # Append end-of-message marker\n response = client_socket.recv(BUFFER_SIZE).decode()\n responses.append(response)\n def test_response_contains_greeting(self):\n \"\"\"Test if the response from the server contains a greeting.\"\"\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, b\"Test request\")\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Ensure that responses is not empty before accessing it\n self.assertTrue(responses) # Check that responses is not empty\n self.assertIn(\"Hello\", responses[0])\n def test_handle_large_request(self):\n \"\"\"\n Test how the function handles a request larger than the buffer size.\n \"\"\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function,\n args=(responses, 
b\"a\" * (BUFFER_SIZE + 1)),\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Expecting a normal response despite a large request\n self.assertIn(\"Hello\", responses[0])\n def test_response_format(self):\n \"\"\"\n Test if the response format from the server is correct.\n \"\"\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, b\"Format request\")\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n response_data = json.loads(responses[0])\n self.assertIn(\"time\", response_data)\n def test_handle_special_characters_request(self):\n \"\"\"\n Test how the function handles a request with special characters.\n \"\"\"\n special_request = b\"!@#$%^&*()_+\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, special_request)\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Expecting a normal response despite a request with special characters\n self.assertIn(\"Hello\", responses[0])\n def test_handle_json_request(self):\n \"\"\"\n Test how the function handles a JSON-formatted request.\n \"\"\"\n json_request = {\"request\": \"time\"}\n json_request_encoded = json.dumps(json_request).encode(\"utf-8\")\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, json_request_encoded)\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Expecting a normal response despite the JSON request\n self.assertIn(\"Hello\", responses[0])", "apis": ["json.dumps", "datetime.datetime.now"], "libs": ["json", "datetime"], "doc": {"description": ["Responds to a client's request by sending a JSON-formatted message containing", "the 
current server time and a greeting."], "note": [], "params": ["client_socket (socket.socket): The client socket from which the request is received."], "returns": ["None"], "reqs": ["datetime.datetime", "json"], "raises": [], "example": [">>> import socket", ">>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)", ">>> server_socket.bind((SERVER_ADDRESS, 0)) # Bind to a free port", ">>> server_socket.bind((SERVER_ADDRESS, 8080))", ">>> server_socket.listen(1)", ">>> try:", "... client_socket, _ = server_socket.accept()", "... f_882(client_socket)", "... finally:", "... server_socket.close()"]}} -{"task_id": "f_405", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.decomposition import PCA\n\n\ndef f_405(array: list, random_seed: int = 42) -> (pd.DataFrame, np.ndarray):\n \"\"\"\n Converts a 2D list into a pandas DataFrame and applies PCA for dimensionality reduction.\n\n This function creates a DataFrame from the provided 2D list and then applies PCA to reduce the dataset\n to its two main components. The function uses a fixed random seed to ensure reproducibility.\n\n Parameters:\n - array (list of list of int): A 2D list representing data rows and columns.\n - random_seed (int, optional): The seed for the random number generator. 
Default is 42.\n\n Returns:\n - pd.DataFrame: The original data in DataFrame format.\n - np.ndarray: The data after PCA transformation.\n\n Requirements:\n - pandas\n - numpy\n - sklearn.decomposition.PCA\n\n Examples:\n >>> data = [[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]]\n >>> df, transformed = f_405(data)\n >>> print(df)\n 0 1 2 3 4\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n 2 11 12 13 14 15\n >>> print(transformed)\n [[ 1.11803399e+01 8.88178420e-16]\n [-0.00000000e+00 -0.00000000e+00]\n [-1.11803399e+01 8.88178420e-16]]\n \"\"\"", "canonical_solution": " df = pd.DataFrame(array)\n\n pca = PCA(n_components=2, random_state=random_seed)\n transformed_data = pca.fit_transform(df)\n\n return df, transformed_data", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic 2-row dataset\n data = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (2, 2))\n def test_case_2(self):\n # Test basic 3-row dataset\n data = [[10, 20, 30, 40, 50], [60, 70, 80, 90, 100], [110, 120, 130, 140, 150]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_3(self):\n # Test mix of positive, negative, zero values\n data = [[-1, -2, -3, -4, -5], [5, 6, 7, 8, 9], [0, 0, 0, 0, 0]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_4(self):\n # Test 4-row dataset with incremental pattern\n data = [\n [5, 15, 25, 35, 45],\n [55, 65, 75, 85, 95],\n [105, 115, 125, 135, 145],\n [155, 165, 175, 185, 195],\n ]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n 
self.assertEqual(transformed_data.shape, (4, 2))\n def test_case_5(self):\n # Test uniform rows\n data = [[10, 10, 10, 10, 10], [20, 20, 20, 20, 20], [30, 30, 30, 30, 30]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_6(self):\n # Test single row (should fail since it's < n_components)\n with self.assertRaises(ValueError):\n data = [[1, 2, 3, 4, 5]]\n f_405(data)\n def test_case_7(self):\n # Test large numbers\n data = [[1000000000, 2000000000], [-1000000000, -2000000000]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (2, 2))\n def test_case_8(self):\n # Test correctness of PCA\n data = [[2, 3], [3, 4], [5, 6]]\n _, transformed_data = f_405(data)\n # Using the sklearn PCA output as the expected transformation\n expected_transformation = np.array(\n [\n [-1.88561808e00, 1.93816421e-16],\n [-4.71404521e-01, 3.32511118e-16],\n [2.35702260e00, 2.21555360e-16],\n ]\n )\n np.testing.assert_almost_equal(\n transformed_data, expected_transformation, decimal=5\n )\n def test_case_9(self):\n # Test floats\n data = [[1.5, 2.5], [3.5, 4.5], [5.5, 6.5]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))", "apis": ["numpy.ndarray", "pandas.DataFrame", "sklearn.decomposition.PCA"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Converts a 2D list into a pandas DataFrame and applies PCA for dimensionality reduction.", "This function creates a DataFrame from the provided 2D list and then applies PCA to reduce the dataset", "to its two main components. 
The function uses a fixed random seed to ensure reproducibility."], "note": [], "params": ["array (list of list of int): A 2D list representing data rows and columns.", "random_seed (int, optional): The seed for the random number generator. Default is 42."], "returns": ["pd.DataFrame: The original data in DataFrame format.", "np.ndarray: The data after PCA transformation."], "reqs": ["pandas", "numpy", "sklearn.decomposition.PCA"], "raises": [], "example": ["Examples:", ">>> data = [[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]]", ">>> df, transformed = f_405(data)", ">>> print(df)", "0 1 2 3 4", "0 1 2 3 4 5", "1 6 7 8 9 10", "2 11 12 13 14 15", ">>> print(transformed)", "[[ 1.11803399e+01 8.88178420e-16]", "[-0.00000000e+00 -0.00000000e+00]", "[-1.11803399e+01 8.88178420e-16]]"]}} -{"task_id": "f_403", "prompt": "import pandas as pd\nimport seaborn as sns\n\n\ndef f_403(array):\n \"\"\"Generates a DataFrame and heatmap from a 2D list.\n\n This function takes a 2D list and returns a pandas DataFrame and a seaborn heatmap\n representing the correlation matrix of the DataFrame. Assumes sublists of length 5.\n Also assumes DataFrame columns: 'A', 'B', 'C', 'D', 'E'.\n\n Parameters:\n - array (list of list of int): 2D list with sublists of length 5. 
Must not be empty.\n\n Returns:\n - DataFrame: Constructed from the input 2D list.\n - heatmap: Seaborn heatmap of the DataFrame's correlation matrix.\n\n Requirements:\n - pandas\n - seaborn\n\n Example:\n >>> df, ax = f_403([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]])\n >>> df\n A B C D E\n 0 1 2 3 4 5\n 1 5 4 3 2 1\n >>> ax\n \n \"\"\"", "canonical_solution": " COLUMNS = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n\n if not array or any(len(sublist) != 5 for sublist in array):\n raise ValueError(\"array must be non-empty and all sublists must have a length of 5.\")\n\n df = pd.DataFrame(array, columns=COLUMNS)\n heatmap = sns.heatmap(df.corr(), annot=True)\n return df, heatmap", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport random\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n random.seed(42)\n cls.mock_data = [[random.randint(1, 100) for _ in range(5)] for _ in range(5)]\n def test_case_1(self):\n # Test dataframe creation with valid input\n df, _ = f_403(self.mock_data)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape, (5, 5))\n def test_case_2(self):\n # Test heatmap creation with valid input\n _, heatmap = f_403(self.mock_data)\n self.assertIsNotNone(heatmap)\n def test_case_3(self):\n # Test correlation accuracy with known data\n correlated_data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]\n df, _ = f_403(correlated_data)\n corr_matrix = df.corr()\n np.testing.assert_array_almost_equal(\n corr_matrix, np.corrcoef(correlated_data, rowvar=False)\n )\n def test_case_4(self):\n # Test handling of non-numeric data\n with self.assertRaises(ValueError):\n f_403([[\"a\", \"b\", \"c\", \"d\", \"e\"], [1, 2, 3, 4, 5]])\n def test_case_5(self):\n # Test with empty list\n with self.assertRaises(ValueError):\n f_403([])\n def test_case_6(self):\n # Test with single sublist\n single_sublist = [[1, 2, 3, 4, 5]]\n df, _ = f_403(single_sublist)\n self.assertEqual(df.shape, (1, 5))\n def 
test_case_7(self):\n # Test handling sublists of varying lengths\n with self.assertRaises(ValueError):\n f_403([[1, 2, 3], [4, 5, 6, 7, 8]])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "seaborn.heatmap"], "libs": ["pandas", "seaborn"], "doc": {"description": ["Generates a DataFrame and heatmap from a 2D list.", "This function takes a 2D list and returns a pandas DataFrame and a seaborn heatmap", "representing the correlation matrix of the DataFrame. Assumes sublists of length 5.", "Also assumes DataFrame columns: 'A', 'B', 'C', 'D', 'E'."], "note": [], "params": ["array (list of list of int): 2D list with sublists of length 5. Must not be empty."], "returns": ["DataFrame: Constructed from the input 2D list.", "heatmap: Seaborn heatmap of the DataFrame's correlation matrix."], "reqs": ["pandas", "seaborn"], "raises": [], "example": [">>> df, ax = f_403([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]])", ">>> df", "A B C D E", "0 1 2 3 4 5", "1 5 4 3 2 1", ">>> ax", ""]}} -{"task_id": "f_378", "prompt": "import re\nimport random\nimport pandas as pd\n\n\ndef f_378(data_list, seed=None):\n \"\"\"\n Shuffle the substrings within each string in a given list.\n\n This function takes a list of comma-separated strings and splits each into substrings.\n It extracts substrings based on commas, removing leading and trailing whitespaces\n from each. Then, it shuffles these processed substrings within each string, and\n returns a pandas DataFrame with two columns: \"Original String\" and \"Shuffled String\".\n\n Parameters:\n data_list (list): The list of comma-separated strings.\n seed (int, optional): Seed for the random number generator. 
Default is None.\n\n Returns:\n DataFrame: A pandas DataFrame with columns 'Original String' and 'Shuffled String'.\n\n Requirements:\n - pandas\n - random\n - re\n\n Example:\n >>> f_378(['lamp, bag, mirror', 'table, chair'], seed=42)\n Original String Shuffled String\n 0 lamp, bag, mirror bag, lamp, mirror\n 1 table, chair chair, table\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n shuffled_strings = []\n for s in data_list:\n substrings = re.split(\"\\s*,\\s*\", s)\n random.shuffle(substrings)\n shuffled_s = \", \".join(substrings)\n shuffled_strings.append(shuffled_s)\n\n df[\"Shuffled String\"] = shuffled_strings\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n input_data = [\"lamp, bag, mirror\", \"table, chair\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"lamp, bag, mirror\")\n self.assertEqual(output_df[\"Original String\"].iloc[1], \"table, chair\")\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[0].split(\", \")), 3)\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[1].split(\", \")), 2)\n def test_case_2(self):\n # Test single character substrings\n input_data = [\"A, B, C, D\", \"E, F, G\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"A, B, C, D\")\n self.assertEqual(output_df[\"Original String\"].iloc[1], \"E, F, G\")\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[0].split(\", \")), 4)\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[1].split(\", \")), 3)\n def test_case_3(self):\n # Test single-item list\n input_data = [\"word1, word2\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"word1, word2\")\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[0].split(\", \")), 2)\n def 
test_case_4(self):\n # Tests shuffling with an empty string\n input_data = [\"\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"\")\n self.assertEqual(output_df[\"Shuffled String\"].iloc[0], \"\")\n def test_case_5(self):\n # Test shuffling single substring (no shuffling)\n input_data = [\"single\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"single\")\n self.assertEqual(output_df[\"Shuffled String\"].iloc[0], \"single\")\n def test_case_6(self):\n # Testing the effect of a specific random seed to ensure reproducibility\n input_data = [\"a, b, c, d\"]\n output_df1 = f_378(input_data, seed=42)\n output_df2 = f_378(input_data, seed=42)\n self.assertEqual(\n output_df1[\"Shuffled String\"].iloc[0], output_df2[\"Shuffled String\"].iloc[0]\n )\n def test_case_7(self):\n # Tests shuffling with varying spaces around commas\n input_data = [\"one,two, three\"]\n corrected_expected_shuffled = \"two, one, three\"\n output_df = f_378(input_data, seed=42)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"one,two, three\")\n self.assertEqual(\n output_df[\"Shuffled String\"].iloc[0], corrected_expected_shuffled\n )", "apis": ["re.split", "random.shuffle", "pandas.DataFrame", "random.seed"], "libs": ["pandas", "re", "random"], "doc": {"description": ["Shuffle the substrings within each string in a given list.", "This function takes a list of comma-separated strings and splits each into substrings.", "It extracts substrings based on commas, removing leading and trailing whitespaces", "from each. Then, it shuffles these processed substrings within each string, and", "returns a pandas DataFrame with two columns: \"Original String\" and \"Shuffled String\"."], "note": [], "params": ["data_list (list): The list of comma-separated strings.", "seed (int, optional): Seed for the random number generator. 
Default is None."], "returns": ["DataFrame: A pandas DataFrame with columns 'Original String' and 'Shuffled String'."], "reqs": ["pandas", "random", "re"], "raises": [], "example": [">>> f_378(['lamp, bag, mirror', 'table, chair'], seed=42)", "Original String Shuffled String", "0 lamp, bag, mirror bag, lamp, mirror", "1 table, chair chair, table"]}} +{"task_id": "f_805", "prompt": "import os\nfrom pathlib import Path\nimport glob\nimport shutil\n\n\ndef f_805(source_directory: str, target_directory: str):\n \"\"\"\n Moves files with specific extensions from a source directory to a target directory,\n handling naming conflicts by renaming duplicates.\n\n Parameters:\n - source_directory (str): The absolute or relative path of the source directory.\n - target_directory (str): The absolute or relative path of the target directory.\n This function will create it if it does not exist.\n\n Returns:\n - int: The number of files successfully moved.\n\n Raises:\n - FileNotFoundError: If source_directory does not exist.\n\n Requirements:\n - os\n - pathlib\n - glob\n - shutil\n\n Notes:\n - This function scans the source directory recursively to find files.\n - Files are filtered by the extensions: \".txt\", \".docx\", \".xlsx\", \".csv\".\n - Renaming of files due to naming conflicts follows the pattern '-n.'.\n\n Examples:\n >>> f_805('./source_folder', './target_folder')\n 3\n >>> f_805('./empty_folder', './target_folder')\n 0\n \"\"\"", "canonical_solution": " moved_files = 0\n\n if not os.path.exists(source_directory):\n raise FileNotFoundError(\"source_directory must exist.\")\n\n if not os.path.exists(target_directory):\n os.makedirs(target_directory)\n\n for extension in [\".txt\", \".docx\", \".xlsx\", \".csv\"]:\n filepaths = glob.glob(\n os.path.join(source_directory, \"**\", \"*\" + extension), recursive=True\n )\n for filepath in filepaths:\n filename = Path(filepath).name\n stem = Path(filepath).stem\n target_filepath = os.path.join(target_directory, 
filename)\n\n count = 1\n while os.path.exists(target_filepath):\n new_filename = f\"{stem}-{count}{extension}\"\n target_filepath = os.path.join(target_directory, new_filename)\n count += 1\n\n shutil.move(filepath, target_filepath)\n moved_files += 1\n\n return moved_files", "test": "import unittest\nimport tempfile\nfrom pathlib import Path\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.valid_extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n def test_case_1(self):\n # Test with an empty source directory\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result, 0, \"Should return 0 for an empty source directory.\"\n )\n def test_case_2(self):\n # Test with a source directory containing only files with no extensions\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n for i in range(3):\n Path(f\"{source_dir}/file_{i}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result, 0, \"Should return 0 for files with non-matching extensions.\"\n )\n def test_case_3(self):\n # Test with a source directory containing files with a mix of extensions\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n extensions = self.valid_extensions + [\".pdf\", \".jpg\"]\n for i, ext in enumerate(extensions):\n Path(f\"{source_dir}/file_{i}{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertTrue(result == len(self.valid_extensions))\n def test_case_4(self):\n # Test with a source directory containing files with all matching extensions\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n for i, ext in enumerate(self.valid_extensions):\n Path(f\"{source_dir}/file_{i}{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result, 4, \"Should return 4 for all 
files with matching extensions.\"\n )\n def test_case_5(self):\n # Test with a source directory containing nested directories with files\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n Path(f\"{source_dir}/subdir1\").mkdir()\n Path(f\"{source_dir}/subdir1/subdir2\").mkdir()\n for i, ext in enumerate(extensions):\n Path(f\"{source_dir}/file_{i}{ext}\").touch()\n Path(f\"{source_dir}/subdir1/file_{i}{ext}\").touch()\n Path(f\"{source_dir}/subdir1/subdir2/file_{i}{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result,\n 12,\n \"Should return 12 for all files in nested directories with matching extensions.\",\n )\n def test_case_6(self):\n # Test files with the same name in different subdirectories of the source directory\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n Path(f\"{source_dir}/subdir1\").mkdir()\n Path(f\"{source_dir}/subdir2\").mkdir()\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n # Create files with the same name in different subdirectories\n for ext in extensions:\n (Path(f\"{source_dir}/subdir1\") / f\"file{ext}\").touch()\n (Path(f\"{source_dir}/subdir2\") / f\"file{ext}\").touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(\n result,\n 8,\n \"Should correctly move files with the same name from different source directories.\",\n )\n def test_case_7(self):\n # Test handling of invalid path inputs\n source_dir = \"/path/does/not/exist\"\n with tempfile.TemporaryDirectory() as target_dir:\n with self.assertRaises(FileNotFoundError):\n f_805(source_dir, target_dir)\n def test_case_8(self):\n # Test file renaming when handling duplicate files\n with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:\n extensions = self.valid_extensions\n for i, ext in enumerate(extensions):\n filename = 
f\"file_{i}{ext}\"\n # Create duplicate files in the source directory\n Path(os.path.join(source_dir, filename)).touch()\n # Create expected duplicate files in the target directory to force renaming\n Path(os.path.join(target_dir, filename)).touch()\n result = f_805(source_dir, target_dir)\n self.assertEqual(result, len(extensions), \"Should have moved all files.\")\n # Check if files were renamed correctly to avoid overwriting\n expected_files = [f\"file_{i}-1{ext}\" for i, ext in enumerate(extensions)]\n actual_files = [Path(f).name for f in glob.glob(f\"{target_dir}/*\")]\n for expected_file in expected_files:\n self.assertIn(\n expected_file,\n actual_files,\n f\"{expected_file} was not found in target directory.\",\n )", "apis": ["os.makedirs", "glob.glob", "os.path", "pathlib.Path", "os.path.join", "os.path.exists", "shutil.move"], "libs": ["os", "glob", "shutil", "pathlib"], "doc": {"description": ["Moves files with specific extensions from a source directory to a target directory,", "handling naming conflicts by renaming duplicates.", "Notes:", "- This function scans the source directory recursively to find files.", "- Files are filtered by the extensions: \".txt\", \".docx\", \".xlsx\", \".csv\".", "- Renaming of files due to naming conflicts follows the pattern '-n.'."], "note": [], "params": ["source_directory (str): The absolute or relative path of the source directory.", "target_directory (str): The absolute or relative path of the target directory.", "This function will create it if it does not exist."], "returns": ["int: The number of files successfully moved."], "reqs": ["os", "pathlib", "glob", "shutil"], "raises": ["FileNotFoundError: If source_directory does not exist."], "example": ["Examples:", ">>> f_805('./source_folder', './target_folder')", "3", ">>> f_805('./empty_folder', './target_folder')", "0"]}} +{"task_id": "f_811", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_811(df):\n \"\"\"\n 
Creates and return a heatmap of the cumulative sum of each column in a pandas DataFrame.\n\n Parameters:\n - df (pandas.DataFrame): A DataFrame with numerical values.\n\n Returns:\n - matplotlib.axes._subplots.Axes: The Axes object of the Seaborn heatmap.\n\n Raises:\n - ValueError: If the DataFrame is empty or if no numeric columns are present.\n\n Requirements:\n - pandas\n - matplotlib\n - seaborn\n\n Notes:\n - Only numeric columns are considered for the heatmap. Non-numeric columns are ignored.\n\n Example:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n >>> ax = f_811(df)\n \"\"\"", "canonical_solution": " numeric_df = df.select_dtypes(include=[\"number\"])\n if numeric_df.empty:\n raise ValueError(\"No numeric columns present\")\n\n df_cumsum = numeric_df.cumsum()\n ax = sns.heatmap(df_cumsum)\n return ax", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def test_cumsum_correctness(self):\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n ax = f_811(df)\n result_cumsum = df.cumsum().values.flatten()\n heatmap_data = ax.collections[0].get_array().data.flatten()\n np.testing.assert_array_equal(\n result_cumsum, heatmap_data, \"Cumulative sum calculation is incorrect\"\n )\n def test_non_numeric_columns_ignored(self):\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [\"one\", \"two\", \"three\"]})\n ax = f_811(df)\n self.assertIsInstance(\n ax, plt.Axes, \"The result should be a matplotlib Axes object\"\n )\n self.assertEqual(\n len(ax.get_xticklabels()), 1, \"Non-numeric columns should be ignored\"\n )\n def test_with_positive_numbers(self):\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib Axes object\"\n )\n def test_with_negative_numbers(self):\n df = pd.DataFrame({\"A\": [-1, -2, -3], \"B\": [-4, -5, 
-6]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib Axes object\"\n )\n def test_with_mixed_numbers(self):\n df = pd.DataFrame({\"A\": [1, -2, 3], \"B\": [-4, 5, -6]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib Axes object\"\n )\n def test_with_zeroes(self):\n df = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n result = f_811(df)\n self.assertIsInstance(\n result, plt.Axes, \"The result should be a matplotlib Axes object\"\n )\n def test_with_empty_dataframe(self):\n df = pd.DataFrame({\"A\": [], \"B\": []})\n with self.assertRaises(ValueError):\n f_811(df)\n def test_no_numeric_columns(self):\n df = pd.DataFrame({\"A\": [\"one\", \"two\", \"three\"], \"B\": [\"four\", \"five\", \"six\"]})\n with self.assertRaises(ValueError):\n f_811(df)", "apis": ["seaborn.heatmap"], "libs": ["seaborn"], "doc": {"description": ["Creates and return a heatmap of the cumulative sum of each column in a pandas DataFrame.", "Notes:", "- Only numeric columns are considered for the heatmap. 
Non-numeric columns are ignored."], "note": [], "params": ["df (pandas.DataFrame): A DataFrame with numerical values."], "returns": ["matplotlib.axes._subplots.Axes: The Axes object of the Seaborn heatmap."], "reqs": ["pandas", "matplotlib", "seaborn"], "raises": ["ValueError: If the DataFrame is empty or if no numeric columns are present."], "example": [">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})", ">>> ax = f_811(df)"]}} +{"task_id": "f_875", "prompt": "import matplotlib.pyplot as plt\nimport random\nimport string\nimport pandas as pd\nimport seaborn as sns\n\n# Constants\nLETTERS = list(string.ascii_lowercase)\n\n\ndef f_875(rows=1000, string_length=3):\n \"\"\"\n Generate a dataframe of random strings and create a heatmap showing the correlation\n in the frequency of each letter in these strings.\n\n This function generates a specified number of random strings, each of a given length,\n and calculates the frequency of each letter in these strings. A heatmap of the \n correlation matrix is then displayed, showing the co-occurrence frequencies of different \n letters within these strings.\n\n If the number of rows specified is zero, the function will print a message indicating\n that no data is available to generate the heatmap and will return None. Otherwise, \n it processes the DataFrame to convert the generated strings into a one-hot encoded format\n and then sums up these encodings to calculate the frequency of each letter.\n\n Parameters:\n - rows (int, optional): Number of random strings to generate. Must be non-negative. \n Default is 1000. If set to 0, the function returns None after printing a message.\n - string_length (int, optional): Length of each random string. Must be non-negative. \n Default is 3. 
A value of 0 results in the generation of empty strings.\n\n Returns:\n - matplotlib.axes._subplots.Axes or None: A seaborn heatmap plot object if \n data is generated; otherwise, None.\n\n Requirements:\n - random\n - string\n - pandas\n - seaborn\n - matplotlib\n\n Note\n - If no strings are generated (e.g., rows = 0), the \n DataFrame will be empty. In this case, the function prints a message \"No data to generate heatmap.\" and returns None.\n - If the DataFrame is not empty, each string is split into its \n constituent letters, converted into one-hot encoded format, and then the frequency \n of each letter is calculated by summing these encodings.\n \n Example:\n >>> ax = f_875(1000, 3)\n >>> ax.get_xlim()\n (0.0, 26.0)\n \"\"\"", "canonical_solution": "\n # Generate random strings\n data = [\"\".join(random.choices(LETTERS, k=string_length)) for _ in range(rows)]\n\n # Create a DataFrame and compute letter frequency\n df = pd.DataFrame({\"String\": data})\n\n # Check if the DataFrame is empty\n if df.empty:\n print(\"No data to generate heatmap.\")\n return None\n\n df = pd.get_dummies(df[\"String\"].apply(list).explode()).groupby(level=0).sum()\n\n # Calculate the correlation matrix\n corr = df.corr()\n\n # Create and return the heatmap\n ax = sns.heatmap(corr, annot=True, fmt=\".2f\")\n plt.close() # Close the plot to prevent it from showing during function call\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_875.\"\"\"\n def test_default_parameters(self):\n \"\"\"\n Test f_875 with default parameters (rows=1000, string_length=3).\n Verifies if the function returns a matplotlib Axes object.\n \"\"\"\n random.seed(0)\n result = f_875()\n self.assertIsInstance(result, plt.Axes)\n def test_custom_rows(self):\n \"\"\"\n Test f_875 with a custom number of rows.\n Verifies if the function still returns a matplotlib Axes object.\n \"\"\"\n random.seed(1)\n result = 
f_875(rows=500)\n self.assertIsInstance(result, plt.Axes)\n def test_custom_string_length(self):\n \"\"\"\n Test f_875 with a custom string length.\n Verifies if the function still returns a matplotlib Axes object.\n \"\"\"\n random.seed(2)\n result = f_875(string_length=5)\n self.assertIsInstance(result, plt.Axes)\n def test_large_dataset(self):\n \"\"\"\n Test f_875 with a large dataset.\n Verifies if the function can handle a large number of rows without errors.\n \"\"\"\n random.seed(3)\n result = f_875(rows=10000, string_length=3)\n self.assertIsInstance(result, plt.Axes)\n def test_zero_rows(self):\n \"\"\"\n Test f_875 with zero rows.\n Verifies if the function handles edge case of zero rows by returning None.\n \"\"\"\n random.seed(4)\n result = f_875(rows=0)\n self.assertIsNone(result, \"Function should return None for zero rows.\")\n def tearDown(self):\n plt.close()", "apis": ["seaborn.heatmap", "pandas.DataFrame", "string.ascii_lowercase", "matplotlib.pyplot.close", "pandas.get_dummies", "random.choices"], "libs": ["string", "seaborn", "random", "pandas", "matplotlib"], "doc": {"description": ["Generate a dataframe of random strings and create a heatmap showing the correlation", "in the frequency of each letter in these strings.", "This function generates a specified number of random strings, each of a given length,", "and calculates the frequency of each letter in these strings. A heatmap of the", "correlation matrix is then displayed, showing the co-occurrence frequencies of different", "letters within these strings.", "If the number of rows specified is zero, the function will print a message indicating", "that no data is available to generate the heatmap and will return None. Otherwise,", "it processes the DataFrame to convert the generated strings into a one-hot encoded format", "and then sums up these encodings to calculate the frequency of each letter.", "Note", "- If no strings are generated (e.g., rows = 0), the", "DataFrame will be empty. 
In this case, the function prints a message \"No data to generate heatmap.\" and returns None.", "- If the DataFrame is not empty, each string is split into its", "constituent letters, converted into one-hot encoded format, and then the frequency", "of each letter is calculated by summing these encodings."], "note": [], "params": ["rows (int, optional): Number of random strings to generate. Must be non-negative.", "Default is 1000. If set to 0, the function returns None after printing a message.", "string_length (int, optional): Length of each random string. Must be non-negative.", "Default is 3. A value of 0 results in the generation of empty strings."], "returns": ["matplotlib.axes._subplots.Axes or None: A seaborn heatmap plot object if", "data is generated; otherwise, None."], "reqs": ["random", "string", "pandas", "seaborn", "matplotlib"], "raises": [], "example": [">>> ax = f_875(1000, 3)", ">>> ax.get_xlim()", "(0.0, 26.0)"]}} +{"task_id": "f_425", "prompt": "import sqlite3\nfrom random import choice, seed\nimport os\n\n\ndef f_425(db_name, table_name, num_entries, random_seed=None):\n \"\"\"\n Create an SQLite3 table and fill it with random data using the provided database and table names.\n\n The function populates the table with columns 'name', 'age', 'height' using random data from the\n following constants:\n - NAMES: List of names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia']\n - AGES: Range of ages from 18 to 65.\n - HEIGHTS: Range of heights from 150cm to 200cm.\n\n Parameters:\n db_name (str): The name of the SQLite3 database.\n table_name (str): The name of the table to create and populate.\n num_entries (int): The number of entries to insert. Must not be negative.\n random_seed (int, optional): The seed for generating random values. 
Default is None.\n\n Returns:\n str: The absolute path of the SQLite3 database file.\n\n Requirements:\n - sqlite3\n - random.choice\n - random.seed\n - os\n\n Example:\n >>> db_path = f_425('test.db', 'People', 100, random_seed=42)\n >>> print(db_path)\n '/absolute/path/to/test.db'\n \"\"\"", "canonical_solution": " NAMES = [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"]\n AGES = range(18, 65)\n HEIGHTS = range(150, 200)\n\n if random_seed:\n seed(random_seed)\n\n if num_entries < 0:\n raise ValueError(\"num_entries must not be negative\")\n\n conn = sqlite3.connect(db_name)\n cur = conn.cursor()\n cur.execute(f\"CREATE TABLE {table_name} (name TEXT, age INTEGER, height INTEGER)\")\n\n for _ in range(num_entries):\n name = choice(NAMES)\n age = choice(AGES)\n height = choice(HEIGHTS)\n cur.execute(f\"INSERT INTO {table_name} VALUES (?, ?, ?)\", (name, age, height))\n\n conn.commit()\n return os.path.abspath(db_name)", "test": "import unittest\nimport sqlite3\nimport os\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.temp_dir_path = self.temp_dir.name\n self.db_name = \"test_function.db\"\n self.db_path = os.path.join(self.temp_dir_path, self.db_name)\n self.table_name = \"TestTable\"\n self.random_seed = 42\n def tearDown(self):\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test basic case\n num_entries = 5\n db_path = f_425(\n self.db_path, self.table_name, num_entries, random_seed=self.random_seed\n )\n self.assertTrue(os.path.exists(db_path))\n self.verify_db_content(num_entries)\n def test_case_2(self):\n # Test handling 0 entries\n num_entries = 0\n db_path = f_425(\n self.db_path, self.table_name, num_entries, random_seed=self.random_seed\n )\n self.assertTrue(os.path.exists(db_path))\n self.verify_db_content(num_entries)\n def test_case_3(self):\n # Test handling 1 entry\n num_entries = 1\n db_path = f_425(\n self.db_path, self.table_name, num_entries, 
random_seed=self.random_seed\n )\n self.assertTrue(os.path.exists(db_path))\n self.verify_db_content(num_entries)\n def test_case_4(self):\n # Test handling invalid num_entries\n with self.assertRaises(Exception):\n f_425(self.db_path, self.table_name, -1, random_seed=self.random_seed)\n with self.assertRaises(Exception):\n f_425(self.db_path, self.table_name, \"1\", random_seed=self.random_seed)\n def test_case_5(self):\n # Test invalid table names (SQL keywords)\n with self.assertRaises(sqlite3.OperationalError):\n f_425(self.db_path, \"Select\", 10)\n def test_case_6(self):\n # Test against SQL injection in table_name parameter\n malicious_name = \"Test; DROP TABLE IntegrityCheck;\"\n with self.assertRaises(sqlite3.OperationalError):\n f_425(self.db_path, malicious_name, 1)\n def verify_db_content(self, num_entries):\n # Connect to the database and check if the table has correct number of entries\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(f\"SELECT COUNT(*) FROM {self.table_name}\")\n count = cur.fetchone()[0]\n self.assertEqual(count, num_entries)\n # Verify data integrity\n cur.execute(f\"SELECT name, age, height FROM {self.table_name}\")\n rows = cur.fetchall()\n for row in rows:\n self.assertIn(row[0], [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"])\n self.assertIn(row[1], list(range(18, 65)))\n self.assertIn(row[2], list(range(150, 200)))", "apis": ["sqlite3.connect", "os.path", "random.seed", "random.choice", "os.path.abspath"], "libs": ["os", "random", "sqlite3"], "doc": {"description": ["Create an SQLite3 table and fill it with random data using the provided database and table names.", "The function populates the table with columns 'name', 'age', 'height' using random data from the", "following constants:", "- NAMES: List of names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia']", "- AGES: Range of ages from 18 to 65.", "- HEIGHTS: Range of heights from 150cm to 200cm."], "note": [], "params": ["db_name 
(str): The name of the SQLite3 database.", "table_name (str): The name of the table to create and populate.", "num_entries (int): The number of entries to insert. Must not be negative.", "random_seed (int, optional): The seed for generating random values. Default is None."], "returns": ["str: The absolute path of the SQLite3 database file."], "reqs": ["sqlite3", "random.choice", "random.seed", "os"], "raises": [], "example": [">>> db_path = f_425('test.db', 'People', 100, random_seed=42)", ">>> print(db_path)", "'/absolute/path/to/test.db'"]}} +{"task_id": "f_408", "prompt": "import collections\nimport matplotlib.pyplot as plt\n\n\ndef f_408(data):\n \"\"\"\n Combine a list of dictionaries with the same keys (fruit names) into a single dictionary,\n calculate the total turnover for each fruit, and return a bar chart's axes with colors representing\n different fruits. The colors are selected from: 'red', 'yellow', 'green', 'blue', 'purple'. The function\n ensures that sales quantity must not be negative, throwing a ValueError if encountered.\n\n Parameters:\n data (list): A list of dictionaries. 
The keys are fruit names and the values are sales quantities.\n Sales quantity must not be negative.\n\n Returns:\n total_sales (dict): A dictionary containing the total sales for each fruit.\n ax (matplotlib.container.BarContainer): A bar chart of total fruit sales, or None if data is empty\n\n Requirements:\n - collections\n - matplotlib.pyplot\n\n Example:\n >>> sales, plot = f_408([{'apple': 10, 'banana': 15, 'cherry': 12},\\\n {'apple': 12, 'banana': 20, 'cherry': 14},\\\n {'apple': 15, 'banana': 18, 'cherry': 15},\\\n {'apple': 11, 'banana': 17, 'cherry': 13}])\n >>> sales\n {'apple': 48, 'banana': 70, 'cherry': 54}\n >>> type(plot)\n \n \"\"\"", "canonical_solution": " if not data:\n return dict(), None\n\n all_keys = set().union(*data)\n for d in data:\n for k, v in d.items():\n if v < 0:\n raise ValueError(\"Sales quantity must not be negative.\")\n\n combined_dict = dict((k, [d.get(k, 0) for d in data]) for k in all_keys)\n total_sales = {k: sum(v) for k, v in combined_dict.items()}\n total_sales = dict(collections.OrderedDict(sorted(total_sales.items())))\n labels, values = zip(*total_sales.items())\n\n # Define colors dynamically to handle different numbers of fruit types\n colors = [\"red\", \"yellow\", \"green\", \"blue\", \"purple\"] * (len(labels) // 5 + 1)\n\n ax = plt.bar(labels, values, color=colors[: len(labels)])\n plt.xlabel(\"Fruit\")\n plt.ylabel(\"Total Sales\")\n plt.title(\"Total Fruit Sales\")\n\n return total_sales, ax", "test": "import unittest\nimport collections\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case with one fruit\n data = [{\"apple\": 5}, {\"apple\": 7}, {\"apple\": 3}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 15}\n self.assertDictEqual(sales, expected_sales)\n def test_case_2(self):\n # Test basic case with multiple fruits\n data = [\n {\"apple\": 10, \"banana\": 15, \"cherry\": 12, \"date\": 10},\n {\"apple\": 12, \"banana\": 20, 
\"cherry\": 14, \"date\": 9},\n {\"apple\": 15, \"banana\": 18, \"cherry\": 15, \"date\": 8},\n {\"apple\": 11, \"banana\": 17, \"cherry\": 13, \"date\": 7},\n ]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 48, \"banana\": 70, \"cherry\": 54, \"date\": 34}\n self.assertDictEqual(sales, expected_sales)\n def test_case_3(self):\n # Test basic case with one entry per fruit\n data = [{\"apple\": 1}, {\"banana\": 2}, {\"cherry\": 3}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 1, \"banana\": 2, \"cherry\": 3}\n self.assertDictEqual(sales, expected_sales)\n def test_case_4(self):\n # Test zero quantities\n data = [\n {\"apple\": 0, \"banana\": 0},\n {\"apple\": 0, \"banana\": 0},\n {\"apple\": 0, \"banana\": 0},\n ]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 0, \"banana\": 0}\n self.assertDictEqual(sales, expected_sales)\n def test_case_5(self):\n # Test empty data\n data = []\n sales, _ = f_408(data)\n expected_sales = {}\n self.assertDictEqual(sales, expected_sales)\n def test_case_6(self):\n # Test missing fruit\n data = [{\"apple\": 10, \"banana\": 5}, {\"banana\": 15, \"cherry\": 7}, {\"cherry\": 3}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 10, \"banana\": 20, \"cherry\": 10}\n self.assertDictEqual(sales, expected_sales)\n def test_case_7(self):\n # Test negative sales\n data = [{\"apple\": -10, \"banana\": 15}, {\"apple\": 12, \"banana\": -20}]\n with self.assertRaises(ValueError):\n f_408(data)\n def test_case_8(self):\n # Test large values\n data = [\n {\"apple\": 1000000, \"banana\": 500000},\n {\"apple\": 2000000, \"banana\": 1500000},\n ]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 3000000, \"banana\": 2000000}\n self.assertDictEqual(sales, expected_sales)\n def test_case_9(self):\n # Test visualization\n data = [{\"apple\": 10, \"banana\": 15}, {\"banana\": 5, \"apple\": 10}]\n _, plot = f_408(data)\n self.assertEqual(\n len(plot.patches), 2\n ) # Checking if the number of bars in the plot is 
correct\n def test_case_10(self):\n # Test non-string keys\n data = [{5: 10, \"banana\": 15}, {\"banana\": 5, 5: 10}]\n with self.assertRaises(TypeError):\n f_408(data)\n def test_case_11(self):\n # Test mixed types in sales\n data = [{\"apple\": 10.5, \"banana\": 15}, {\"apple\": 12, \"banana\": 20.5}]\n sales, _ = f_408(data)\n expected_sales = {\"apple\": 22.5, \"banana\": 35.5}\n self.assertDictEqual(sales, expected_sales)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.bar", "collections.OrderedDict", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.xlabel"], "libs": ["collections", "matplotlib"], "doc": {"description": ["Combine a list of dictionaries with the same keys (fruit names) into a single dictionary,", "calculate the total turnover for each fruit, and return a bar chart's axes with colors representing", "different fruits. The colors are selected from: 'red', 'yellow', 'green', 'blue', 'purple'. The function", "ensures that sales quantity must not be negative, throwing a ValueError if encountered."], "note": [], "params": ["data (list): A list of dictionaries. 
The keys are fruit names and the values are sales quantities.", "Sales quantity must not be negative."], "returns": ["total_sales (dict): A dictionary containing the total sales for each fruit.", "ax (matplotlib.container.BarContainer): A bar chart of total fruit sales, or None if data is empty"], "reqs": ["collections", "matplotlib.pyplot"], "raises": [], "example": [">>> sales, plot = f_408([{'apple': 10, 'banana': 15, 'cherry': 12},\\", "{'apple': 12, 'banana': 20, 'cherry': 14},\\", "{'apple': 15, 'banana': 18, 'cherry': 15},\\", "{'apple': 11, 'banana': 17, 'cherry': 13}])", ">>> sales", "{'apple': 48, 'banana': 70, 'cherry': 54}", ">>> type(plot)", ""]}} +{"task_id": "f_332", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_332(data):\n \"\"\"Scales numeric columns of a data dictionary using the StandardScaler.\n\n This function scales the numeric columns of a dataframe using the StandardScaler from scikit-learn.\n Non-numeric columns remain unchanged. If a column contains mixed data types, it tries to convert the entire column\n to float. 
If any value in the column cannot be converted to float, the entire column is left unchanged.\n\n Requirements:\n - pandas\n - sklearn.preprocessing.StandardScaler\n \n Parameters:\n - data (dict): Input data.\n\n Returns:\n - pd.DataFrame: Dataframe with scaled numeric columns.\n\n Example:\n >>> result = f_332({'x': [10, 20, 30, 40]})\n >>> result\n x\n 0 -1.341641\n 1 -0.447214\n 2 0.447214\n 3 1.341641\n >>> result2 = f_332({'a': [10.5, 23.4, 15.6, 78.9],'b': [45.6, 67.8, 89.0, 12.3],'c': ['apple', 'banana', 'cherry', 'date']})\n >>> result2\n a b c\n 0 -0.788098 -0.284409 apple\n 1 -0.317428 0.497496 banana\n 2 -0.602019 1.244180 cherry\n 3 1.707546 -1.457267 date\n \"\"\"", "canonical_solution": " dataframe = pd.DataFrame(data)\n # Initialize the scaler\n scaler = StandardScaler()\n\n # Iterate over columns and scale if they are numeric\n for column in dataframe.columns:\n if dataframe[column].dtype in [\"float64\", \"int64\"]:\n dataframe[column] = scaler.fit_transform(\n dataframe[column].values.reshape(-1, 1)\n )\n else:\n # Attempt to convert the entire column to float and then scale\n converted_column = dataframe[column].apply(pd.to_numeric, errors=\"coerce\")\n if (\n not converted_column.isna().all()\n ): # If all values are convertible to float\n dataframe[column] = scaler.fit_transform(\n converted_column.values.reshape(-1, 1)\n )\n return dataframe", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n \"\"\"Test the correctness of the scaling applied by the function.\"\"\"\n # Creating a sample dataframe with three numeric columns\n data = {\n \"a\": [10.5, 23.4, 15.6, 78.9],\n \"b\": [45.6, 67.8, 89.0, 12.3],\n \"c\": [12.3, 45.6, 78.9, 0.1],\n }\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n 
self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))\n def test_case_2(self):\n \"\"\"Test with an empty DataFrame.\"\"\"\n # Creating an empty dataframe\n data = {}\n df = pd.DataFrame(data)\n result = f_332(data)\n # Ensuring the result is also an empty dataframe\n self.assertTrue(result.empty)\n def test_case_3(self):\n \"\"\"Test with a DataFrame that doesn't have any columns to scale.\"\"\"\n # Creating a dataframe with a single non-numeric column\n data = {\"c\": [\"foo\", \"bar\"]}\n df = pd.DataFrame(data)\n result = f_332(data)\n # Ensuring the output dataframe is unchanged\n pd.testing.assert_frame_equal(result, df, check_dtype=False)\n def test_case_4(self):\n \"\"\"Test with a DataFrame where all columns are to be scaled.\"\"\"\n # Creating a dataframe with two numeric columns\n data = {\"a\": [10.5, 23.4, 15.6, 78.9], \"b\": [45.6, 67.8, 89.0, 12.3]}\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))\n def test_case_5(self):\n \"\"\"Test with a DataFrame with single rows.\"\"\"\n # Creating a dataframe with a single row and three columns\n data = {\"a\": [5.5], \"b\": [8.6], \"c\": [7.7]}\n df = pd.DataFrame(data)\n result = f_332(data)\n self.assertDictEqual(result.to_dict(), {'a': {0: 0.0}, 'b': {0: 0.0}, 'c': {0: 0.0}})\n def test_case_6(self):\n \"\"\"Test with a DataFrame with mixed datatypes.\"\"\"\n # Creating a dataframe with mixed data types (both floats and strings) in columns\n data = {\n \"a\": [10.5, 23.4, 15.6, \"78.9\"],\n 
\"b\": [45.6, \"67.8\", 89.0, 12.3],\n \"c\": [12.3, 45.6, 78.9, \"0.1\"],\n }\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))\n def test_case_7(self):\n \"\"\"Test with a DataFrame with negative values.\"\"\"\n # Creating a dataframe with negative values in columns\n data = {\"a\": [-1, -2, -3, -4], \"b\": [-4, -5, -6, -7], \"c\": [-7, -8, -9, -10]}\n df = pd.DataFrame(\n data\n )\n result = f_332(data)\n # Checking if the mean of scaled columns is approximately 0 and standard deviation is approximately 1\n self.assertTrue(np.isclose(result[\"a\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(result[\"b\"].mean(), 0, atol=1e-7))\n self.assertTrue(np.isclose(np.std(result[\"a\"]), 1, atol=1e-2))\n self.assertTrue(np.isclose(np.std(result[\"b\"]), 1, atol=1e-2))", "apis": ["pandas.DataFrame", "sklearn.preprocessing.StandardScaler", "pandas.to_numeric"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Scales numeric columns of a data dictionary using the StandardScaler.", "This function scales the numeric columns of a dataframe using the StandardScaler from scikit-learn.", "Non-numeric columns remain unchanged. If a column contains mixed data types, it tries to convert the entire column", "to float. 
If any value in the column cannot be converted to float, the entire column is left unchanged."], "note": [], "params": ["data (dict): Input data."], "returns": ["pd.DataFrame: Dataframe with scaled numeric columns."], "reqs": ["pandas", "sklearn.preprocessing.StandardScaler"], "raises": [], "example": [">>> result = f_332({'x': [10, 20, 30, 40]})", ">>> result", "x", "0 -1.341641", "1 -0.447214", "2 0.447214", "3 1.341641", ">>> result2 = f_332({'a': [10.5, 23.4, 15.6, 78.9],'b': [45.6, 67.8, 89.0, 12.3],'c': ['apple', 'banana', 'cherry', 'date']})", ">>> result2", "a b c", "0 -0.788098 -0.284409 apple", "1 -0.317428 0.497496 banana", "2 -0.602019 1.244180 cherry", "3 1.707546 -1.457267 date"]}} +{"task_id": "f_389", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport random\n\n\ndef f_389(\n epoch_milliseconds,\n random_seed=0,\n products=[\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n):\n \"\"\"\n Generate sales data for five products from a given epoch time up to the current time.\n\n This function checks input validity, then for each day between the date of the given epoch\n time to the date of the current time, generates random sales data for each of the 5 products.\n\n Parameters:\n - epoch_milliseconds (int): Start epoch time in milliseconds. Must be before current system time.\n - random_seed (int): Seed for reproducibility of random sales data. Defaults to 0.\n - products (list of str): Product list to choose from. Must contain 5 unique strings.\n Defaults to ['Product1', 'Product2', 'Product3', 'Product4', 'Product5'].\n\n Returns:\n - pd.DataFrame: A DataFrame containing sales data with columns 'Product' (string), 'Date' (datetime),\n and 'Sales' (integer). 
Sales quantity is randomly sampled from range [10, 50].\n\n Requirements:\n - pandas\n - datetime.datetime\n - random\n\n Example:\n >>> sales_data = f_389(1236472051807, random_seed=42)\n >>> type(sales_data)\n \n >>> sales_data.head()\n Product Date Sales\n 0 Product4 2009-03-08 11:27:31.807 50\n 1 Product5 2009-03-08 11:27:31.807 17\n 2 Product1 2009-03-08 11:27:31.807 11\n 3 Product3 2009-03-08 11:27:31.807 27\n 4 Product2 2009-03-08 11:27:31.807 25\n \"\"\"", "canonical_solution": " random.seed(random_seed)\n\n products = list(set(products))\n if len(products) != 5:\n raise ValueError(\"Products must contain 5 unique items\")\n\n start_date = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n end_date = datetime.now()\n if start_date >= end_date:\n raise ValueError(\"Start time must be before current system time\")\n\n date_range = pd.date_range(start_date, end_date, freq=\"D\")\n sales_data = []\n for date in date_range:\n for product in products:\n sales = random.randint(10, 50)\n sales_data.append([product, date, sales])\n\n df = pd.DataFrame(sales_data, columns=[\"Product\", \"Date\", \"Sales\"])\n return df", "test": "import unittest\nfrom datetime import datetime, timedelta\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n sales_data = f_389(1631289600000, random_seed=42)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], datetime.fromtimestamp(1631289600000 / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())),\n [\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n )\n def test_case_2(self):\n # Test 3 days ago\n three_days_ago = (datetime.now() - timedelta(days=3)).timestamp() * 1000\n sales_data = f_389(three_days_ago, random_seed=42)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], 
datetime.fromtimestamp(three_days_ago / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())),\n [\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n )\n def test_case_3(self):\n # Test 1 month ago\n one_month_ago = (datetime.now() - timedelta(days=30)).timestamp() * 1000\n sales_data = f_389(one_month_ago, random_seed=42)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], datetime.fromtimestamp(one_month_ago / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())),\n [\"Product1\", \"Product2\", \"Product3\", \"Product4\", \"Product5\"],\n )\n def test_case_4(self):\n # Test custom products\n custom_products = [\"apple\", \"banana\", \"carrot\", \"durian\", \"eggplant\"]\n sales_data = f_389(1577836800000, random_seed=42, products=custom_products)\n self.assertListEqual(list(sales_data.columns), [\"Product\", \"Date\", \"Sales\"])\n self.assertEqual(\n sales_data[\"Date\"].iloc[0], datetime.fromtimestamp(1577836800000 / 1000.0)\n )\n self.assertListEqual(\n sorted(list(sales_data[\"Product\"].unique())), custom_products\n )\n def test_case_5(self):\n # Test handling invalid time - future\n with self.assertRaises(ValueError):\n f_389(int((datetime.now() + timedelta(days=1)).timestamp() * 1000))\n def test_case_6(self):\n # Test handling invalid products - 4 unique items\n with self.assertRaises(ValueError):\n f_389(1631289600000, products=[\"this\", \"is\", \"too\", \"short\"])\n def test_case_7(self):\n # Test handling invalid products - 5 items but with duplicates\n with self.assertRaises(ValueError):\n f_389(1631289600000, products=[\"a\", \"a\", \"b\", \"c\", \"d\"])", "apis": ["datetime.datetime.fromtimestamp", "pandas.DataFrame", "datetime.datetime.now", "random.randint", "pandas.date_range", "random.seed"], "libs": ["random", "pandas", "datetime"], "doc": {"description": ["Generate sales data 
for five products from a given epoch time up to the current time.", "This function checks input validity, then for each day between the date of the given epoch", "time to the date of the current time, generates random sales data for each of the 5 products."], "note": [], "params": ["epoch_milliseconds (int): Start epoch time in milliseconds. Must be before current system time.", "random_seed (int): Seed for reproducibility of random sales data. Defaults to 0.", "products (list of str): Product list to choose from. Must contain 5 unique strings.", "Defaults to ['Product1', 'Product2', 'Product3', 'Product4', 'Product5']."], "returns": ["pd.DataFrame: A DataFrame containing sales data with columns 'Product' (string), 'Date' (datetime),", "and 'Sales' (integer). Sales quantity is randomly sampled from range [10, 50]."], "reqs": ["pandas", "datetime.datetime", "random"], "raises": [], "example": [">>> sales_data = f_389(1236472051807, random_seed=42)", ">>> type(sales_data)", "", ">>> sales_data.head()", "Product Date Sales", "0 Product4 2009-03-08 11:27:31.807 50", "1 Product5 2009-03-08 11:27:31.807 17", "2 Product1 2009-03-08 11:27:31.807 11", "3 Product3 2009-03-08 11:27:31.807 27", "4 Product2 2009-03-08 11:27:31.807 25"]}} +{"task_id": "f_931", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\ndef f_931(mean=123456.908, std_dev=1.2, save_plots=False):\n \"\"\"\n Generate a random sample from a normal distribution, analyze its skewness and kurtosis,\n and create a histogram and a QQ plot to visualize the distribution.\n\n Parameters:\n - mean (float, optional): Mean of the normal distribution. Defaults to 123456.908.\n - std_dev (float, optional): Standard deviation of the normal distribution. Defaults to 1.2.\n - save_plots (bool, optional): If True, saves the plots to files. 
Defaults to False.\n\n Returns:\n - float: Skewness of the sample.\n - float: Kurtosis of the sample.\n - list: Paths to the saved plot files, empty if save_plots is False.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - scipy.stats\n\n Example:\n >>> np.random.seed(0)\n >>> skewness, kurtosis, plot_paths = f_931(123456.908, 1.2, True)\n >>> print(f'Skewness: {skewness}, Kurtosis: {kurtosis}, Plots: {plot_paths}')\n Skewness: 0.03385895323538189, Kurtosis: -0.04676632447765128, Plots: ['histogram_plot.png', 'qq_plot.png']\n\n \"\"\"", "canonical_solution": " sample = np.random.normal(mean, std_dev, 1000)\n plot_paths = []\n\n # Plotting histogram\n plt.figure()\n plt.hist(sample, bins=50)\n if save_plots:\n hist_path = \"histogram_plot.png\"\n plt.savefig(hist_path)\n plt.close()\n plot_paths.append(hist_path)\n\n # Plotting QQ diagram\n plt.figure()\n stats.probplot(sample, plot=plt)\n if save_plots:\n qq_path = \"qq_plot.png\"\n plt.savefig(qq_path)\n plt.close()\n plot_paths.append(qq_path)\n\n skewness = stats.skew(sample)\n kurtosis = stats.kurtosis(sample)\n\n return skewness, kurtosis, plot_paths", "test": "import unittest\nimport os\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_931.\"\"\"\n def test_default_parameters(self):\n \"\"\"\n Test f_931 with default parameters.\n \"\"\"\n np.random.seed(0)\n skewness, kurtosis, plot_paths = f_931()\n self.assertAlmostEqual(skewness, 0, delta=0.5)\n self.assertAlmostEqual(kurtosis, 0, delta=0.5)\n self.assertEqual(len(plot_paths), 0)\n def test_save_plots_true(self):\n \"\"\"\n Test f_931 with save_plots set to True.\n \"\"\"\n np.random.seed(1)\n _, _, plot_paths = f_931(save_plots=True)\n self.assertEqual(len(plot_paths), 2)\n for path in plot_paths:\n self.assertTrue(os.path.exists(path))\n os.remove(path) # Clean up: remove created files\n def test_custom_mean_std_dev(self):\n \"\"\"\n Test f_931 with custom mean and standard deviation.\n \"\"\"\n np.random.seed(2)\n 
mean = 100\n std_dev = 10\n skewness, kurtosis, _ = f_931(mean, std_dev)\n self.assertAlmostEqual(skewness, 0, delta=1)\n self.assertAlmostEqual(kurtosis, 0, delta=1)\n def test_negative_std_dev(self):\n \"\"\"\n Test f_931 with a negative standard deviation.\n \"\"\"\n np.random.seed(3)\n with self.assertRaises(ValueError):\n f_931(std_dev=-1)\n def test_large_sample(self):\n \"\"\"\n Test f_931 with a larger sample size.\n \"\"\"\n np.random.seed(4)\n _, _, plot_paths = f_931(mean=1000, std_dev=50, save_plots=True)\n self.assertEqual(len(plot_paths), 2)\n for path in plot_paths:\n self.assertTrue(os.path.exists(path))\n os.remove(path) # Clean up: remove created files", "apis": ["matplotlib.pyplot.figure", "scipy.stats.kurtosis", "numpy.random", "matplotlib.pyplot.savefig", "matplotlib.pyplot.hist", "scipy.stats.probplot", "scipy.stats.skew", "matplotlib.pyplot.close", "numpy.random.normal"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Generate a random sample from a normal distribution, analyze its skewness and kurtosis,", "and create a histogram and a QQ plot to visualize the distribution."], "note": [], "params": ["mean (float, optional): Mean of the normal distribution. Defaults to 123456.908.", "std_dev (float, optional): Standard deviation of the normal distribution. Defaults to 1.2.", "save_plots (bool, optional): If True, saves the plots to files. 
Defaults to False."], "returns": ["float: Skewness of the sample.", "float: Kurtosis of the sample.", "list: Paths to the saved plot files, empty if save_plots is False."], "reqs": ["numpy", "matplotlib.pyplot", "scipy.stats"], "raises": [], "example": [">>> np.random.seed(0)", ">>> skewness, kurtosis, plot_paths = f_931(123456.908, 1.2, True)", ">>> print(f'Skewness: {skewness}, Kurtosis: {kurtosis}, Plots: {plot_paths}')", "Skewness: 0.03385895323538189, Kurtosis: -0.04676632447765128, Plots: ['histogram_plot.png', 'qq_plot.png']"]}} +{"task_id": "f_753", "prompt": "from functools import reduce\nimport operator\nimport string\n\ndef f_753(letters):\n \"\"\"\n Calculate the product of the corresponding numbers for a list of uppercase letters, \n where \\\"A\\\" corresponds to 1, \\\"B\\\" to 2, etc.\n \n Parameters:\n letters (list of str): A list of uppercase letters.\n \n Returns:\n int: The product of the numbers corresponding to the input letters.\n \n Requirements:\n - functools.reduce\n - operator\n - string\n \n Examples:\n >>> f_753([\\\"A\\\", \\\"B\\\", \\\"C\\\"])\n 6\n \n >>> f_753([\\\"A\\\", \\\"E\\\", \\\"I\\\"])\n 45\n \n Note:\n The function uses a predefined dictionary to map each uppercase letter to its corresponding number.\n \"\"\"", "canonical_solution": " # Creating a dictionary to map each letter to its corresponding number\n letter_to_number = {letter: i+1 for i, letter in enumerate(string.ascii_uppercase)}\n \n # Convert the letters to numbers\n numbers = [letter_to_number[letter] for letter in letters]\n \n # Calculate the product using functools.reduce and operator.mul\n product = reduce(operator.mul, numbers, 1)\n \n return product", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Input: [\"A\", \"B\", \"C\"]\n # Expected Output: 6 (1 * 2 * 3)\n result = f_753([\"A\", \"B\", \"C\"])\n self.assertEqual(result, 6)\n \n def test_case_2(self):\n # Input: [\"A\", \"E\", \"I\"]\n # Expected Output: 
45 (1 * 5 * 9)\n result = f_753([\"A\", \"E\", \"I\"])\n self.assertEqual(result, 45)\n def test_case_3(self):\n # Input: [\"Z\"]\n # Expected Output: 26\n result = f_753([\"Z\"])\n self.assertEqual(result, 26)\n def test_case_4(self):\n # Input: [\"X\", \"Y\", \"Z\"]\n # Expected Output: 24 * 25 * 26\n result = f_753([\"X\", \"Y\", \"Z\"])\n self.assertEqual(result, 24 * 25 * 26)\n \n def test_case_5(self):\n # Input: [\"A\", \"A\", \"A\"]\n # Expected Output: 1 (1 * 1 * 1)\n result = f_753([\"A\", \"A\", \"A\"])\n self.assertEqual(result, 1)", "apis": ["string.ascii_uppercase", "functools.reduce", "operator.mul"], "libs": ["operator", "functools", "string"], "doc": {"description": ["Calculate the product of the corresponding numbers for a list of uppercase letters,", "where \\\"A\\\" corresponds to 1, \\\"B\\\" to 2, etc.", ">>> f_753([\\\"A\\\", \\\"E\\\", \\\"I\\\"])", "45"], "note": ["The function uses a predefined dictionary to map each uppercase letter to its corresponding number."], "params": ["letters (list of str): A list of uppercase letters."], "returns": ["int: The product of the numbers corresponding to the input letters."], "reqs": ["functools.reduce", "operator", "string"], "raises": [], "example": ["Examples:", ">>> f_753([\\\"A\\\", \\\"B\\\", \\\"C\\\"])", "6"]}} +{"task_id": "f_822", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\n\n\ndef f_822(\n feature_array,\n target_array,\n feature_names=[\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"],\n target_name=\"target\",\n seed=None,\n):\n \"\"\"\n Shuffle the columns of a given numpy array and train a Random Forest Classifier on the shuffled data.\n\n Parameters:\n - feature_array (numpy.ndarray): 2D array containing the feature data with shape (n_samples, n_features).\n - target_array (numpy.ndarray): 1D array containing the target data with shape (n_samples,).\n - feature_names (list of str, optional): Names of the features corresponding to the 
columns in `feature_array`.\n Defaults to ['f1', 'f2', 'f3', 'f4', 'f5'].\n - target_name (str, optional): Name of the target column. Defaults to 'target'.\n - seed (int, optional): Seed for the random number generator to make shuffling reproducible. Defaults to None.\n\n Returns:\n sklearn.ensemble.RandomForestClassifier: A trained Random Forest Classifier on the shuffled feature data.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Examples:\n >>> feature_array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n >>> target_array = np.array([0, 1])\n >>> clf = f_822(feature_array, target_array)\n >>> type(clf)\n \n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n shuffled_array = feature_array.copy()\n np.random.shuffle(shuffled_array.T)\n\n df = pd.DataFrame(shuffled_array, columns=feature_names)\n df[target_name] = target_array\n\n clf = RandomForestClassifier()\n clf.fit(df[feature_names], df[target_name])\n\n return clf", "test": "import unittest\nimport numpy as np\nfrom sklearn.ensemble import RandomForestClassifier\nimport warnings\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n target = np.array([0, 1])\n clf = f_822(array, target)\n self.assertIsInstance(clf, RandomForestClassifier)\n self.assertTrue(len(clf.feature_importances_) > 0)\n self.assertEqual(set(np.unique(target)), set(clf.classes_))\n with warnings.catch_warnings():\n # Temporarily suppress warning - clf prefers named array\n warnings.simplefilter(\"ignore\", category=UserWarning)\n predictions = clf.predict(array)\n np.testing.assert_array_equal(\n predictions,\n target,\n \"The model's predictions do not match the expected target values.\",\n )\n def test_case_2(self):\n # Test identical features\n array = np.ones((10, 5))\n target = np.zeros(10)\n clf = f_822(array, target)\n self.assertTrue(len(clf.feature_importances_) > 0)\n def test_case_3(self):\n # Test all 
unique targets\n array = np.array([[i] * 5 for i in range(10)])\n target = np.arange(10)\n clf = f_822(array, target)\n self.assertEqual(len(np.unique(target)), len(clf.classes_))\n def test_case_4(self):\n # Test random seed reproducibility\n np.random.seed(0)\n array = np.random.rand(10, 5)\n target = np.random.randint(0, 2, 10)\n clf1 = f_822(array, target, seed=42)\n clf2 = f_822(array, target, seed=42)\n self.assertEqual(\n clf1.feature_importances_.tolist(), clf2.feature_importances_.tolist()\n )\n def test_case_5(self):\n # Test negative features\n array = np.array([[-1, -2, -3, -4, -5], [-6, -7, -8, -9, -10]])\n target = np.array([0, 1])\n clf = f_822(array, target)\n self.assertTrue(len(clf.feature_importances_) > 0)\n def test_case_6(self):\n # Test single feature array\n array = np.arange(10).reshape(-1, 1)\n target = np.array([0, 1] * 5)\n feature_names = [\"f1\"]\n clf = f_822(array, target, feature_names)\n self.assertTrue(len(clf.feature_importances_) > 0)\n def test_case_7(self):\n # Test exception handling for incompatible shapes among arrays\n array = np.array([[1, 2, 3], [4, 5, 6]])\n target = np.array([0, 1, 2])\n with self.assertRaises(ValueError):\n f_822(array, target)\n def test_case_8(self):\n # Test exception handling for incompatible feature_names vs array shape\n array = np.array([[1, 2, 3], [4, 5, 6]]) # 2x3 array\n target = np.array([0, 1])\n incorrect_feature_names = [\"f1\", \"f2\"] # Only 2 names for a 3-column array\n with self.assertRaises(ValueError):\n f_822(array, target, feature_names=incorrect_feature_names)\n def test_case_9(self):\n # Test custom feature names\n array = np.array([[7, 8], [9, 10]])\n target = np.array([0, 1])\n custom_feature_names = [\"custom1\", \"custom2\"]\n clf = f_822(array, target, feature_names=custom_feature_names)\n self.assertEqual(clf.feature_importances_.size, len(custom_feature_names))\n def test_case_10(self):\n # Test custom target name\n array = np.array([[11, 12, 13, 14, 15], [16, 17, 18, 
19, 20]])\n target = np.array([1, 0])\n custom_target_name = \"custom_target\"\n clf = f_822(array, target, target_name=custom_target_name)\n # Check if the model was trained successfully\n self.assertTrue(len(clf.feature_importances_) > 0)", "apis": ["numpy.random.shuffle", "pandas.DataFrame", "numpy.random", "sklearn.ensemble.RandomForestClassifier", "numpy.random.seed"], "libs": ["pandas", "numpy", "sklearn"], "doc": {"description": ["Shuffle the columns of a given numpy array and train a Random Forest Classifier on the shuffled data."], "note": [], "params": ["feature_array (numpy.ndarray): 2D array containing the feature data with shape (n_samples, n_features).", "target_array (numpy.ndarray): 1D array containing the target data with shape (n_samples,).", "feature_names (list of str, optional): Names of the features corresponding to the columns in `feature_array`.", "Defaults to ['f1', 'f2', 'f3', 'f4', 'f5'].", "target_name (str, optional): Name of the target column. Defaults to 'target'.", "seed (int, optional): Seed for the random number generator to make shuffling reproducible. 
Defaults to None."], "returns": ["sklearn.ensemble.RandomForestClassifier: A trained Random Forest Classifier on the shuffled feature data."], "reqs": ["numpy", "pandas", "sklearn"], "raises": [], "example": ["Examples:", ">>> feature_array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])", ">>> target_array = np.array([0, 1])", ">>> clf = f_822(feature_array, target_array)", ">>> type(clf)", ""]}} +{"task_id": "f_882", "prompt": "from datetime import datetime\nimport json\n\nSERVER_ADDRESS = \"localhost\"\nBUFFER_SIZE = 1024\n\n\ndef f_882(client_socket):\n \"\"\"\n Responds to a client's request by sending a JSON-formatted message containing\n the current server time and a greeting.\n\n Parameters:\n - client_socket (socket.socket): The client socket from which the request is received.\n\n Requirements:\n - datetime.datetime\n - json\n\n Returns:\n - None\n\n Example:\n >>> import socket\n >>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n >>> server_socket.bind((SERVER_ADDRESS, 0)) # Bind to a free port\n >>> server_socket.bind((SERVER_ADDRESS, 8080))\n >>> server_socket.listen(1)\n >>> try:\n ... client_socket, _ = server_socket.accept()\n ... f_882(client_socket)\n ... finally:\n ... 
server_socket.close()\n \"\"\"", "canonical_solution": " response_data = {\"message\": \"Hello\", \"time\": str(datetime.now())}\n response = json.dumps(response_data) + \"\\n\"\n client_socket.send(response.encode(\"utf-8\"))\n client_socket.close()", "test": "import unittest\nimport socket\nimport threading\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_882.\"\"\"\n def setUp(self):\n \"\"\"Set up a server socket for testing.\"\"\"\n self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n self.server_socket.bind((SERVER_ADDRESS, 0)) # Bind to a free port\n self.server_socket.listen(1)\n self.port = self.server_socket.getsockname()[1]\n def tearDown(self):\n \"\"\"Close the server socket after each test.\"\"\"\n self.server_socket.close()\n def client_thread_function(self, responses, request_message):\n \"\"\"Function to simulate a client sending a request and receiving a response.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client_socket:\n client_socket.connect((SERVER_ADDRESS, self.port))\n client_socket.send(request_message + b\"\\n\") # Append end-of-message marker\n response = client_socket.recv(BUFFER_SIZE).decode()\n responses.append(response)\n def test_response_contains_greeting(self):\n \"\"\"Test if the response from the server contains a greeting.\"\"\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, b\"Test request\")\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Ensure that responses is not empty before accessing it\n self.assertTrue(responses) # Check that responses is not empty\n self.assertIn(\"Hello\", responses[0])\n def test_handle_large_request(self):\n \"\"\"\n Test how the function handles a request larger than the buffer size.\n \"\"\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function,\n args=(responses, 
b\"a\" * (BUFFER_SIZE + 1)),\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Expecting a normal response despite a large request\n self.assertIn(\"Hello\", responses[0])\n def test_response_format(self):\n \"\"\"\n Test if the response format from the server is correct.\n \"\"\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, b\"Format request\")\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n response_data = json.loads(responses[0])\n self.assertIn(\"time\", response_data)\n def test_handle_special_characters_request(self):\n \"\"\"\n Test how the function handles a request with special characters.\n \"\"\"\n special_request = b\"!@#$%^&*()_+\"\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, special_request)\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Expecting a normal response despite a request with special characters\n self.assertIn(\"Hello\", responses[0])\n def test_handle_json_request(self):\n \"\"\"\n Test how the function handles a JSON-formatted request.\n \"\"\"\n json_request = {\"request\": \"time\"}\n json_request_encoded = json.dumps(json_request).encode(\"utf-8\")\n responses = []\n client_thread = threading.Thread(\n target=self.client_thread_function, args=(responses, json_request_encoded)\n )\n client_thread.start()\n client_socket, _ = self.server_socket.accept()\n f_882(client_socket)\n client_thread.join()\n # Expecting a normal response despite the JSON request\n self.assertIn(\"Hello\", responses[0])", "apis": ["datetime.datetime.now", "json.dumps"], "libs": ["json", "datetime"], "doc": {"description": ["Responds to a client's request by sending a JSON-formatted message containing", "the 
current server time and a greeting."], "note": [], "params": ["client_socket (socket.socket): The client socket from which the request is received."], "returns": ["None"], "reqs": ["datetime.datetime", "json"], "raises": [], "example": [">>> import socket", ">>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)", ">>> server_socket.bind((SERVER_ADDRESS, 0)) # Bind to a free port", ">>> server_socket.bind((SERVER_ADDRESS, 8080))", ">>> server_socket.listen(1)", ">>> try:", "... client_socket, _ = server_socket.accept()", "... f_882(client_socket)", "... finally:", "... server_socket.close()"]}} +{"task_id": "f_405", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.decomposition import PCA\n\n\ndef f_405(array: list, random_seed: int = 42) -> (pd.DataFrame, np.ndarray):\n \"\"\"\n Converts a 2D list into a pandas DataFrame and applies PCA for dimensionality reduction.\n\n This function creates a DataFrame from the provided 2D list and then applies PCA to reduce the dataset\n to its two main components. The function uses a fixed random seed to ensure reproducibility.\n\n Parameters:\n - array (list of list of int): A 2D list representing data rows and columns.\n - random_seed (int, optional): The seed for the random number generator. 
Default is 42.\n\n Returns:\n - pd.DataFrame: The original data in DataFrame format.\n - np.ndarray: The data after PCA transformation.\n\n Requirements:\n - pandas\n - numpy\n - sklearn.decomposition.PCA\n\n Examples:\n >>> data = [[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]]\n >>> df, transformed = f_405(data)\n >>> print(df)\n 0 1 2 3 4\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n 2 11 12 13 14 15\n >>> print(transformed)\n [[ 1.11803399e+01 8.88178420e-16]\n [-0.00000000e+00 -0.00000000e+00]\n [-1.11803399e+01 8.88178420e-16]]\n \"\"\"", "canonical_solution": " df = pd.DataFrame(array)\n\n pca = PCA(n_components=2, random_state=random_seed)\n transformed_data = pca.fit_transform(df)\n\n return df, transformed_data", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic 2-row dataset\n data = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (2, 2))\n def test_case_2(self):\n # Test basic 3-row dataset\n data = [[10, 20, 30, 40, 50], [60, 70, 80, 90, 100], [110, 120, 130, 140, 150]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_3(self):\n # Test mix of positive, negative, zero values\n data = [[-1, -2, -3, -4, -5], [5, 6, 7, 8, 9], [0, 0, 0, 0, 0]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_4(self):\n # Test 4-row dataset with incremental pattern\n data = [\n [5, 15, 25, 35, 45],\n [55, 65, 75, 85, 95],\n [105, 115, 125, 135, 145],\n [155, 165, 175, 185, 195],\n ]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n 
self.assertEqual(transformed_data.shape, (4, 2))\n def test_case_5(self):\n # Test uniform rows\n data = [[10, 10, 10, 10, 10], [20, 20, 20, 20, 20], [30, 30, 30, 30, 30]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_6(self):\n # Test single row (should fail since it's < n_components)\n with self.assertRaises(ValueError):\n data = [[1, 2, 3, 4, 5]]\n f_405(data)\n def test_case_7(self):\n # Test large numbers\n data = [[1000000000, 2000000000], [-1000000000, -2000000000]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (2, 2))\n def test_case_8(self):\n # Test correctness of PCA\n data = [[2, 3], [3, 4], [5, 6]]\n _, transformed_data = f_405(data)\n # Using the sklearn PCA output as the expected transformation\n expected_transformation = np.array(\n [\n [-1.88561808e00, 1.93816421e-16],\n [-4.71404521e-01, 3.32511118e-16],\n [2.35702260e00, 2.21555360e-16],\n ]\n )\n np.testing.assert_almost_equal(\n transformed_data, expected_transformation, decimal=5\n )\n def test_case_9(self):\n # Test floats\n data = [[1.5, 2.5], [3.5, 4.5], [5.5, 6.5]]\n df, transformed_data = f_405(data)\n expected_df = pd.DataFrame(data)\n self.assertTrue(df.equals(expected_df))\n self.assertEqual(transformed_data.shape, (3, 2))", "apis": ["pandas.DataFrame", "numpy.ndarray", "sklearn.decomposition.PCA"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Converts a 2D list into a pandas DataFrame and applies PCA for dimensionality reduction.", "This function creates a DataFrame from the provided 2D list and then applies PCA to reduce the dataset", "to its two main components. 
The function uses a fixed random seed to ensure reproducibility."], "note": [], "params": ["array (list of list of int): A 2D list representing data rows and columns.", "random_seed (int, optional): The seed for the random number generator. Default is 42."], "returns": ["pd.DataFrame: The original data in DataFrame format.", "np.ndarray: The data after PCA transformation."], "reqs": ["pandas", "numpy", "sklearn.decomposition.PCA"], "raises": [], "example": ["Examples:", ">>> data = [[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]]", ">>> df, transformed = f_405(data)", ">>> print(df)", "0 1 2 3 4", "0 1 2 3 4 5", "1 6 7 8 9 10", "2 11 12 13 14 15", ">>> print(transformed)", "[[ 1.11803399e+01 8.88178420e-16]", "[-0.00000000e+00 -0.00000000e+00]", "[-1.11803399e+01 8.88178420e-16]]"]}} +{"task_id": "f_403", "prompt": "import pandas as pd\nimport seaborn as sns\n\n\ndef f_403(array):\n \"\"\"Generates a DataFrame and heatmap from a 2D list.\n\n This function takes a 2D list and returns a pandas DataFrame and a seaborn heatmap\n representing the correlation matrix of the DataFrame. Assumes sublists of length 5.\n Also assumes DataFrame columns: 'A', 'B', 'C', 'D', 'E'.\n\n Parameters:\n - array (list of list of int): 2D list with sublists of length 5. 
Must not be empty.\n\n Returns:\n - DataFrame: Constructed from the input 2D list.\n - heatmap: Seaborn heatmap of the DataFrame's correlation matrix.\n\n Requirements:\n - pandas\n - seaborn\n\n Example:\n >>> df, ax = f_403([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]])\n >>> df\n A B C D E\n 0 1 2 3 4 5\n 1 5 4 3 2 1\n >>> ax\n \n \"\"\"", "canonical_solution": " COLUMNS = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n\n if not array or any(len(sublist) != 5 for sublist in array):\n raise ValueError(\"array must be non-empty and all sublists must have a length of 5.\")\n\n df = pd.DataFrame(array, columns=COLUMNS)\n heatmap = sns.heatmap(df.corr(), annot=True)\n return df, heatmap", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport random\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n random.seed(42)\n cls.mock_data = [[random.randint(1, 100) for _ in range(5)] for _ in range(5)]\n def test_case_1(self):\n # Test dataframe creation with valid input\n df, _ = f_403(self.mock_data)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape, (5, 5))\n def test_case_2(self):\n # Test heatmap creation with valid input\n _, heatmap = f_403(self.mock_data)\n self.assertIsNotNone(heatmap)\n def test_case_3(self):\n # Test correlation accuracy with known data\n correlated_data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]\n df, _ = f_403(correlated_data)\n corr_matrix = df.corr()\n np.testing.assert_array_almost_equal(\n corr_matrix, np.corrcoef(correlated_data, rowvar=False)\n )\n def test_case_4(self):\n # Test handling of non-numeric data\n with self.assertRaises(ValueError):\n f_403([[\"a\", \"b\", \"c\", \"d\", \"e\"], [1, 2, 3, 4, 5]])\n def test_case_5(self):\n # Test with empty list\n with self.assertRaises(ValueError):\n f_403([])\n def test_case_6(self):\n # Test with single sublist\n single_sublist = [[1, 2, 3, 4, 5]]\n df, _ = f_403(single_sublist)\n self.assertEqual(df.shape, (1, 5))\n def 
test_case_7(self):\n # Test handling sublists of varying lengths\n with self.assertRaises(ValueError):\n f_403([[1, 2, 3], [4, 5, 6, 7, 8]])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "seaborn.heatmap"], "libs": ["seaborn", "pandas"], "doc": {"description": ["Generates a DataFrame and heatmap from a 2D list.", "This function takes a 2D list and returns a pandas DataFrame and a seaborn heatmap", "representing the correlation matrix of the DataFrame. Assumes sublists of length 5.", "Also assumes DataFrame columns: 'A', 'B', 'C', 'D', 'E'."], "note": [], "params": ["array (list of list of int): 2D list with sublists of length 5. Must not be empty."], "returns": ["DataFrame: Constructed from the input 2D list.", "heatmap: Seaborn heatmap of the DataFrame's correlation matrix."], "reqs": ["pandas", "seaborn"], "raises": [], "example": [">>> df, ax = f_403([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]])", ">>> df", "A B C D E", "0 1 2 3 4 5", "1 5 4 3 2 1", ">>> ax", ""]}} +{"task_id": "f_378", "prompt": "import re\nimport random\nimport pandas as pd\n\n\ndef f_378(data_list, seed=None):\n \"\"\"\n Shuffle the substrings within each string in a given list.\n\n This function takes a list of comma-separated strings and splits each into substrings.\n It extracts substrings based on commas, removing leading and trailing whitespaces\n from each. Then, it shuffles these processed substrings within each string, and\n returns a pandas DataFrame with two columns: \"Original String\" and \"Shuffled String\".\n\n Parameters:\n data_list (list): The list of comma-separated strings.\n seed (int, optional): Seed for the random number generator. 
Default is None.\n\n Returns:\n DataFrame: A pandas DataFrame with columns 'Original String' and 'Shuffled String'.\n\n Requirements:\n - pandas\n - random\n - re\n\n Example:\n >>> f_378(['lamp, bag, mirror', 'table, chair'], seed=42)\n Original String Shuffled String\n 0 lamp, bag, mirror bag, lamp, mirror\n 1 table, chair chair, table\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n shuffled_strings = []\n for s in data_list:\n substrings = re.split(\"\\s*,\\s*\", s)\n random.shuffle(substrings)\n shuffled_s = \", \".join(substrings)\n shuffled_strings.append(shuffled_s)\n\n df[\"Shuffled String\"] = shuffled_strings\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n input_data = [\"lamp, bag, mirror\", \"table, chair\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"lamp, bag, mirror\")\n self.assertEqual(output_df[\"Original String\"].iloc[1], \"table, chair\")\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[0].split(\", \")), 3)\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[1].split(\", \")), 2)\n def test_case_2(self):\n # Test single character substrings\n input_data = [\"A, B, C, D\", \"E, F, G\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"A, B, C, D\")\n self.assertEqual(output_df[\"Original String\"].iloc[1], \"E, F, G\")\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[0].split(\", \")), 4)\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[1].split(\", \")), 3)\n def test_case_3(self):\n # Test single-item list\n input_data = [\"word1, word2\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"word1, word2\")\n self.assertEqual(len(output_df[\"Shuffled String\"].iloc[0].split(\", \")), 2)\n def 
test_case_4(self):\n # Tests shuffling with an empty string\n input_data = [\"\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"\")\n self.assertEqual(output_df[\"Shuffled String\"].iloc[0], \"\")\n def test_case_5(self):\n # Test shuffling single substring (no shuffling)\n input_data = [\"single\"]\n output_df = f_378(input_data)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"single\")\n self.assertEqual(output_df[\"Shuffled String\"].iloc[0], \"single\")\n def test_case_6(self):\n # Testing the effect of a specific random seed to ensure reproducibility\n input_data = [\"a, b, c, d\"]\n output_df1 = f_378(input_data, seed=42)\n output_df2 = f_378(input_data, seed=42)\n self.assertEqual(\n output_df1[\"Shuffled String\"].iloc[0], output_df2[\"Shuffled String\"].iloc[0]\n )\n def test_case_7(self):\n # Tests shuffling with varying spaces around commas\n input_data = [\"one,two, three\"]\n corrected_expected_shuffled = \"two, one, three\"\n output_df = f_378(input_data, seed=42)\n self.assertEqual(output_df[\"Original String\"].iloc[0], \"one,two, three\")\n self.assertEqual(\n output_df[\"Shuffled String\"].iloc[0], corrected_expected_shuffled\n )", "apis": ["pandas.DataFrame", "re.split", "random.seed", "random.shuffle"], "libs": ["random", "re", "pandas"], "doc": {"description": ["Shuffle the substrings within each string in a given list.", "This function takes a list of comma-separated strings and splits each into substrings.", "It extracts substrings based on commas, removing leading and trailing whitespaces", "from each. Then, it shuffles these processed substrings within each string, and", "returns a pandas DataFrame with two columns: \"Original String\" and \"Shuffled String\"."], "note": [], "params": ["data_list (list): The list of comma-separated strings.", "seed (int, optional): Seed for the random number generator. 
Default is None."], "returns": ["DataFrame: A pandas DataFrame with columns 'Original String' and 'Shuffled String'."], "reqs": ["pandas", "random", "re"], "raises": [], "example": [">>> f_378(['lamp, bag, mirror', 'table, chair'], seed=42)", "Original String Shuffled String", "0 lamp, bag, mirror bag, lamp, mirror", "1 table, chair chair, table"]}} {"task_id": "f_349", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.mplot3d import Axes3D\n\n\ndef f_349(n_points=100, random_seed=None):\n \"\"\"\n Generate an array of random 3D dots in the range [0, 1) for each dimension\n and draw them in a 3D scatter plot.\n\n Parameters:\n n_points (int): The number of points to generate and plot. Default is 100.\n random_seed (int, optional): Seed for the random number generator. Default is None.\n\n Returns:\n tuple: A tuple containing:\n - points (ndarray): A numpy ndarray of shape (n_points, 3) with the coordinates of the points.\n - plot (Axes3D): A 3D scatter plot of the generated points.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> points, plot = f_349(200, random_seed=42)\n >>> type(points)\n \n >>> type(plot)\n \n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n points = np.random.random((n_points, 3))\n\n fig = plt.figure()\n ax = fig.add_subplot(111, projection=\"3d\")\n ax.scatter(points[:, 0], points[:, 1], points[:, 2])\n\n return points, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.mplot3d import Axes3D\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test default parameters - values\n points, _ = f_349()\n self.assertEqual(points.shape, (100, 3))\n self.assertTrue(\n (points >= 0).all() and (points < 1).all(),\n \"All points should be in the range [0, 1)\",\n )\n def test_case_2(self):\n # Test default parameters - plot\n _, plot = f_349()\n self.assertTrue(isinstance(plot, Axes3D))\n def test_case_3(self):\n # Test 
controlling number of points\n points1, _ = f_349(n_points=1)\n points10, _ = f_349(n_points=10)\n points100, _ = f_349(n_points=100)\n self.assertEqual(points1.shape, (1, 3))\n self.assertEqual(points10.shape, (10, 3))\n self.assertEqual(points100.shape, (100, 3))\n def test_case_4(self):\n # Test random seed\n points1, _ = f_349(random_seed=42)\n points2, _ = f_349(random_seed=42)\n self.assertTrue(\n np.array_equal(points1, points2),\n \"The points should be identical for the same seed\",\n )\n def test_case_5(self):\n # Test handling invalid inputs\n with self.assertRaises(ValueError):\n f_349(-1)\n for invalid in [0.5, \"invalid\", None, []]:\n with self.assertRaises(TypeError):\n f_349(invalid)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random", "matplotlib.pyplot.figure", "numpy.random.seed", "numpy.random.random"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Generate an array of random 3D dots in the range [0, 1) for each dimension", "and draw them in a 3D scatter plot."], "note": [], "params": ["n_points (int): The number of points to generate and plot. Default is 100.", "random_seed (int, optional): Seed for the random number generator. 
Default is None."], "returns": ["tuple: A tuple containing:", "points (ndarray): A numpy ndarray of shape (n_points, 3) with the coordinates of the points.", "plot (Axes3D): A 3D scatter plot of the generated points."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> points, plot = f_349(200, random_seed=42)", ">>> type(points)", "", ">>> type(plot)", ""]}} -{"task_id": "f_377", "prompt": "import random\nimport string\nimport pandas as pd\n\n\ndef f_377(data_list, seed=0):\n \"\"\"\n Replace a random substring (a sequence of characters between two commas or at the beginning/end of the string)\n in a list of strings with a random string (comprising ascii lowercase characters) with the same length as\n the substituted characters.\n\n Parameters:\n data_list (list): Input list of strings.\n Within each string, each substring's leading and trailing whitespaces are removed.\n If empty, it will return a DataFrame with the Original String and Modified String\n columns that is otherwise empty.\n seed (int, optional): The seed for random operations to ensure reproducibility. 
Defaults to 0.\n\n Returns:\n DataFrame: A pandas DataFrame with two columns - 'Original String' and 'Modified String'.\n 'Original String' contains the original strings from the input list, and 'Modified String'\n contains the modified strings where a random substring has been replaced.\n\n Requirements:\n - pandas\n - random\n - string\n\n Example:\n >>> f_377(['lamp, bag, mirror', 'table, chair, bag, lamp'])\n Original String Modified String\n 0 lamp, bag, mirror lamp, tkg, mirror\n 1 table, chair, bag, lamp table, chair, bag, kuhm\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n modified_strings = []\n for s in data_list:\n s = s.strip()\n if not s:\n modified_strings.append(s)\n continue\n substrings = [ss.strip() for ss in s.split(\",\")]\n replace_idx = random.randint(0, len(substrings) - 1)\n random_string = \"\".join(\n random.choices(string.ascii_lowercase, k=len(substrings[replace_idx]))\n )\n substrings[replace_idx] = random_string\n modified_string = \", \".join(substrings)\n modified_strings.append(modified_string)\n\n df[\"Modified String\"] = modified_strings\n\n return df", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with a typical input list\n input_data = [\"lamp, bag, mirror\", \"table, chair, bag, lamp\"]\n result = f_377(input_data, seed=0)\n self.assertTrue(all(item in input_data for item in result[\"Original String\"]))\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n def test_case_2(self):\n # Test with a single-item list\n input_data = [\"lamp, bag, mirror\"]\n result = f_377(input_data, seed=0)\n self.assertTrue(all(item in input_data for item in result[\"Original String\"]))\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n def test_case_3(self):\n # Test with a list of varied length 
strings\n input_data = [\"lamp, chair\", \"table, mirror, bag\", \"desk, bed\"]\n result = f_377(input_data, seed=0)\n self.assertTrue(all(item in input_data for item in result[\"Original String\"]))\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n def test_case_4(self):\n # Test with an empty list\n input_data = []\n result = f_377(input_data, seed=0)\n self.assertEqual(len(result), 0)\n def test_case_5(self):\n # Test with a list of empty strings\n input_data = [\"\", \"\", \"\"]\n result = f_377(input_data, seed=0)\n self.assertEqual(result[\"Original String\"].tolist(), [\"\", \"\", \"\"])\n self.assertEqual(result[\"Modified String\"].tolist(), [\"\", \"\", \"\"])\n def test_case_6(self):\n # Test with strings that have no commas\n input_data = [\"lamps\", \"table\"]\n result = f_377(input_data, seed=1)\n self.assertTrue(\n all(len(modified) == 5 for modified in result[\"Modified String\"])\n )\n def test_case_7(self):\n # Test with strings that contain multiple identical substrings\n input_data = [\"lamp, lamp, lamp\"]\n result = f_377(input_data, seed=2)\n self.assertNotEqual(result[\"Original String\"][0], result[\"Modified String\"][0])\n self.assertTrue(\n any(sub != \"lamp\" for sub in result[\"Modified String\"][0].split(\", \"))\n )\n def test_case_8(self):\n # Test with mixed case input strings\n input_data = [\"Lamp, Bag, Mirror\"]\n result = f_377(input_data, seed=4)\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n self.assertTrue(\n any(char.islower() for char in result[\"Modified String\"][0])\n ) # Ensure replacement is in lowercase\n def test_case_9(self):\n # Test effect of different seeds on output\n input_data = [\"lamp, bag, mirror\"]\n result_seed_0a = f_377(input_data, seed=0)\n result_seed_0b = f_377(input_data, seed=0)\n result_seed_5 = f_377(input_data, seed=5)\n self.assertEqual(\n result_seed_0a[\"Modified String\"][0], 
result_seed_0b[\"Modified String\"][0]\n )\n self.assertNotEqual(\n result_seed_0a[\"Modified String\"][0], result_seed_5[\"Modified String\"][0]\n )\n def test_case_10(self):\n # Test case sensitivity\n input_data = [\"Lamp, Bag, Mirror\"]\n result = f_377(input_data, seed=3)\n original_items = [\n item.lower() for item in result[\"Original String\"][0].split(\", \")\n ]\n modified_items = [item for item in result[\"Modified String\"][0].split(\", \")]\n self.assertTrue(\n any(mod_item not in original_items for mod_item in modified_items),\n \"Modified string should contain a lowercase random replacement not present in the original string\",\n )\n def test_case_11(self):\n # Test whitespaces (i.e. make sure leading/trailing whitespaces are removed in processing substrings)\n input_data = [\" lamp, bag ,mirror \"]\n result = f_377(input_data, seed=3)\n modified = result[\"Modified String\"][0].split(\", \")\n self.assertTrue(\n all(item.strip() == item for item in modified),\n \"All items in the modified string should have leading and trailing whitespaces removed\",\n )", "apis": ["string.ascii_lowercase", "random.randint", "random.seed", "random.choices", "pandas.DataFrame"], "libs": ["string", "pandas", "random"], "doc": {"description": ["Replace a random substring (a sequence of characters between two commas or at the beginning/end of the string)", "in a list of strings with a random string (comprising ascii lowercase characters) with the same length as", "the substituted characters."], "note": [], "params": ["data_list (list): Input list of strings.", "Within each string, each substring's leading and trailing whitespaces are removed.", "If empty, it will return a DataFrame with the Original String and Modified String", "columns that is otherwise empty.", "seed (int, optional): The seed for random operations to ensure reproducibility. 
Defaults to 0."], "returns": ["DataFrame: A pandas DataFrame with two columns - 'Original String' and 'Modified String'.", "'Original String' contains the original strings from the input list, and 'Modified String'", "contains the modified strings where a random substring has been replaced."], "reqs": ["pandas", "random", "string"], "raises": [], "example": [">>> f_377(['lamp, bag, mirror', 'table, chair, bag, lamp'])", "Original String Modified String", "0 lamp, bag, mirror lamp, tkg, mirror", "1 table, chair, bag, lamp table, chair, bag, kuhm"]}} -{"task_id": "f_761", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_761(df, column):\n \"\"\"\n Draw and return a bar chart that shows the distribution of categories in a specific column of a DataFrame.\n \n Note:\n The categories are defined by the constant CATEGORIES, \n which is a list containing ['A', 'B', 'C', 'D', 'E']. If some categories are missing in the DataFrame, \n they will be included in the plot with a count of zero.\n The x label of the plot is set to 'Category', the y label is set to 'Count', and the title is set to 'Distribution of {column}'.\n \n Parameters:\n - df (pandas.DataFrame): The DataFrame to be processed.\n - column (str): The name of the column in the DataFrame that contains the categories.\n \n Output:\n - matplotlib.axes._subplots.AxesSubplot: The Axes object for the generated plot.\n \n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> df = pd.DataFrame({'Category': ['A', 'B', 'B', 'C', 'A', 'D', 'E', 'E', 'D']})\n >>> ax = f_761(df, 'Category')\n # This generates and displays a bar chart showing the distribution of each category within the 'Category' column.\n \n >>> df = pd.DataFrame({'Type': ['A', 'A', 'C', 'E', 'D', 'E', 'D']})\n >>> ax = f_761(df, 'Type')\n # This generates and displays a bar chart showing the distribution of each category within the 'Type' column, including categories with zero occurrences.\n \"\"\"", 
"canonical_solution": " # Define the categories\n CATEGORIES = ['A', 'B', 'C', 'D', 'E']\n \n # Count occurrences of each category\n counts = df[column].value_counts()\n missing_categories = list(set(CATEGORIES) - set(counts.index))\n for category in missing_categories:\n counts[category] = 0\n\n counts = counts.reindex(CATEGORIES)\n \n # Plotting\n ax = counts.plot(kind='bar')\n ax.set_xlabel('Category')\n ax.set_ylabel('Count')\n ax.set_title(f'Distribution of {column}')\n plt.show()\n \n return ax", "test": "import unittest\nimport pandas as pd\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n \n def test_with_all_categories(self):\n \"\"\"Test with all categories present.\"\"\"\n df = pd.DataFrame({'Category': ['A', 'B', 'B', 'C', 'A', 'D', 'E', 'E', 'D']})\n ax = f_761(df, 'Category')\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_xlabel(), 'Category')\n self.assertEqual(ax.get_ylabel(), 'Count')\n self.assertEqual(ax.get_title(), 'Distribution of Category')\n self.assertEqual(len(ax.get_xticks()), 5) # Check the number of x-axis ticks instead\n def test_with_missing_categories(self):\n \"\"\"Test with some categories missing.\"\"\"\n df = pd.DataFrame({'Category': ['A', 'A', 'B', 'C']})\n ax = f_761(df, 'Category')\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.get_xticks()), 5) # Ensure all categories are accounted for, including missing ones\n def test_with_unexpected_category(self):\n \"\"\"Test with a category not in predefined list.\"\"\"\n df = pd.DataFrame({'Category': ['F', 'A', 'B']}) # 'F' is not a predefined category\n ax = f_761(df, 'Category')\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.get_xticks()), 5) # 'F' is ignored, only predefined categories are considered", "apis": ["matplotlib.pyplot.show"], "libs": ["matplotlib"], "doc": {"description": ["Draw and return a bar chart that shows the distribution of categories in a specific column of a DataFrame.", "Output:", 
"- matplotlib.axes._subplots.AxesSubplot: The Axes object for the generated plot.", ">>> df = pd.DataFrame({'Type': ['A', 'A', 'C', 'E', 'D', 'E', 'D']})", ">>> ax = f_761(df, 'Type')", "# This generates and displays a bar chart showing the distribution of each category within the 'Type' column, including categories with zero occurrences."], "note": ["The categories are defined by the constant CATEGORIES,", "which is a list containing ['A', 'B', 'C', 'D', 'E']. If some categories are missing in the DataFrame,", "they will be included in the plot with a count of zero.", "The x label of the plot is set to 'Category', the y label is set to 'Count', and the title is set to 'Distribution of {column}'."], "params": ["df (pandas.DataFrame): The DataFrame to be processed.", "column (str): The name of the column in the DataFrame that contains the categories."], "returns": [], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'Category': ['A', 'B', 'B', 'C', 'A', 'D', 'E', 'E', 'D']})", ">>> ax = f_761(df, 'Category')", "# This generates and displays a bar chart showing the distribution of each category within the 'Category' column."]}} -{"task_id": "f_874", "prompt": "import random\nimport string\nimport pandas as pd\n\n\ndef f_874(n_rows=1000):\n \"\"\"\n Generate a histogram of the frequency of the top 30 unique random 3-letter strings.\n The function creates random strings, each consisting of 3 letters from the lowercase English alphabet.\n It then plots a histogram showing the frequencies of the top 30 most common strings among the generated set.\n\n Parameters:\n - n_rows (int): Number of random 3-letter strings to generate.\n Must be positive. 
Default is 1000.\n\n Returns:\n - ax (matplotlib.axes.Axes): A Matplotlib Axes object containing the histogram.\n Each bar represents one of the top 30 most frequent 3-letter strings.\n\n Raises:\n - ValueError: If `n_rows` is less than or equal to 0.\n\n Requirements:\n - random\n - string\n - pandas\n \n Example:\n >>> ax = f_874(1000)\n >>> ax.get_title()\n 'Top 30 Frequencies of Random 3-Letter Strings'\n \"\"\"", "canonical_solution": " # Check if n_rows is positive\n if n_rows <= 0:\n raise ValueError(\"Number of rows must be greater than 0\")\n\n # Generate random strings\n data = [\"\".join(random.choices(string.ascii_lowercase, k=3)) for _ in range(n_rows)]\n df = pd.DataFrame(data, columns=[\"String\"])\n\n # Aggregate and plot the data\n frequency = df[\"String\"].value_counts()\n ax = frequency.head(30).plot(\n kind=\"bar\"\n ) # Limit to the top 30 frequencies for readability\n ax.set_title(\"Top 30 Frequencies of Random 3-Letter Strings\")\n ax.set_xlabel(\"String\")\n ax.set_ylabel(\"Frequency\")\n\n return ax", "test": "import unittest\nimport random\nfrom matplotlib.axes import Axes\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_874.\"\"\"\n def test_return_type(self):\n \"\"\"Test if the function returns a Matplotlib Axes object.\"\"\"\n random.seed(0)\n result = f_874(100)\n self.assertIsInstance(result, Axes)\n def test_default_parameter(self):\n \"\"\"Test the function with the default parameter.\"\"\"\n result = f_874()\n self.assertIsInstance(result, Axes)\n def test_zero_rows(self):\n \"\"\"Test the function with zero rows.\"\"\"\n with self.assertRaises(ValueError):\n f_874(0)\n def test_negative_rows(self):\n \"\"\"Test the function with a negative number of rows.\"\"\"\n with self.assertRaises(ValueError):\n f_874(-1)\n def test_large_number_of_rows(self):\n \"\"\"Test the function with a large number of rows.\"\"\"\n random.seed(2)\n result = f_874(10000)\n 
self.assertIsInstance(result, Axes)\n def tearDown(self):\n plt.close()", "apis": ["string.ascii_lowercase", "pandas.DataFrame", "random.choices"], "libs": ["string", "pandas", "random"], "doc": {"description": ["Generate a histogram of the frequency of the top 30 unique random 3-letter strings.", "The function creates random strings, each consisting of 3 letters from the lowercase English alphabet.", "It then plots a histogram showing the frequencies of the top 30 most common strings among the generated set."], "note": [], "params": ["n_rows (int): Number of random 3-letter strings to generate.", "Must be positive. Default is 1000."], "returns": ["ax (matplotlib.axes.Axes): A Matplotlib Axes object containing the histogram.", "Each bar represents one of the top 30 most frequent 3-letter strings."], "reqs": ["random", "string", "pandas"], "raises": ["ValueError: If `n_rows` is less than or equal to 0."], "example": [">>> ax = f_874(1000)", ">>> ax.get_title()", "'Top 30 Frequencies of Random 3-Letter Strings'"]}} -{"task_id": "f_886", "prompt": "import smtplib\nfrom email.message import EmailMessage\nimport getpass\n\nSERVER_ADDRESS = \"localhost\"\nSERVER_PORT = 25\nBUFFER_SIZE = 1024\nSMTP_SERVER = \"smtp.gmail.com\"\nSMTP_PORT = 587\n\n\ndef f_886(client_socket):\n \"\"\"\n Receive a message from a client socket and send it as an email via an SMTP server.\n\n Parameters:\n client_socket (socket.socket): The client socket from which the message is received.\n\n Returns:\n - None\n\n Note:\n - Requires a working internet connection and access to an SMTP server.\n - The function asks for the sender's email, recipient's email,\n and sender's email password for authentication.\n\n Requirements:\n - smtplib\n - email.message.EmailMessage\n - getpass\n\n Example:\n >>> import socket\n >>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n >>> server_socket.bind((SERVER_ADDRESS, SERVER_PORT))\n >>> server_socket.listen(5)\n >>> client_socket, addr = 
server_socket.accept()\n >>> f_886(client_socket)\n \"\"\"", "canonical_solution": " request = client_socket.recv(BUFFER_SIZE).decode(\"utf-8\")\n print(f\"Received: {request}\")\n\n email = EmailMessage()\n email[\"From\"] = getpass.getpass(\"Email: \")\n email[\"To\"] = getpass.getpass(\"Recipient: \")\n email[\"Subject\"] = \"Message from socket client\"\n email.set_content(request)\n\n with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as smtp:\n smtp.starttls()\n smtp.login(email[\"From\"], getpass.getpass(\"Password: \"))\n smtp.send_message(email)\n\n response = \"Message sent.\"\n client_socket.send(response.encode(\"utf-8\"))\n client_socket.close()", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport smtplib\nfrom email.message import EmailMessage\nimport getpass\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_886\"\"\"\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_successful_email_send(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test if the email is successfully sent with valid inputs.\n \"\"\"\n # Mock behaviors\n mock_socket.return_value.recv.return_value = b\"Test message\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n # Call the function\n f_886(mock_socket())\n # Assertions\n mock_smtp.assert_called_with(\"smtp.gmail.com\", 587)\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_email_with_empty_message(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test behavior when an empty message is received.\n \"\"\"\n # Mock the recv method to return an empty byte string\n mock_socket.return_value.recv.return_value = b\"\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n mock_smtp_instance = MagicMock()\n mock_smtp.return_value = mock_smtp_instance\n client_socket = MagicMock()\n # Simulate the 
recv and decode behavior by setting the return value of the decode method\n client_socket.recv.return_value.decode.return_value = \"\"\n f_886(client_socket)\n mock_smtp_instance.send_message.assert_not_called()\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_smtp_server_connection_error(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test behavior when there is a network error (e.g., SMTP server unreachable).\n \"\"\"\n # Setup mock for recv to return a valid bytes object\n client_socket = MagicMock()\n client_socket.recv.return_value = b\"Test message\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n mock_smtp.side_effect = smtplib.SMTPConnectError(\n 421, \"Failed to connect to the server\"\n )\n # Expecting an SMTPConnectError\n with self.assertRaises(smtplib.SMTPConnectError):\n f_886(client_socket)\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_socket_closes_after_operation(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test if the socket is properly closed after the operation.\n \"\"\"\n # Setup mock for recv to return a valid bytes object\n client_socket = MagicMock()\n client_socket.recv.return_value = b\"Test message\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n f_886(client_socket)\n # Assert that the socket's close method was called\n client_socket.close.assert_called_once()\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_successful_email_dispatch(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test if the email is successfully composed and sent with valid inputs.\n \"\"\"\n client_socket = MagicMock()\n client_socket.recv.return_value = b\"Hello, this is a test message.\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n 
\"password\",\n ]\n mock_smtp_instance = MagicMock()\n mock_smtp.return_value = mock_smtp_instance\n f_886(client_socket)\n # Assert that the SMTP instance was created\n mock_smtp.assert_called_with(\"smtp.gmail.com\", 587)\n success_response = \"Message sent.\"\n client_socket.send.assert_called_with(success_response.encode(\"utf-8\"))\n client_socket.close.assert_called_once()", "apis": ["email.message.EmailMessage", "getpass.getpass", "smtplib.SMTP"], "libs": ["email", "getpass", "smtplib"], "doc": {"description": ["Receive a message from a client socket and send it as an email via an SMTP server."], "note": ["Requires a working internet connection and access to an SMTP server.", "The function asks for the sender's email, recipient's email,", "and sender's email password for authentication."], "params": ["client_socket (socket.socket): The client socket from which the message is received."], "returns": ["None"], "reqs": ["smtplib", "email.message.EmailMessage", "getpass"], "raises": [], "example": [">>> import socket", ">>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)", ">>> server_socket.bind((SERVER_ADDRESS, SERVER_PORT))", ">>> server_socket.listen(5)", ">>> client_socket, addr = server_socket.accept()", ">>> f_886(client_socket)"]}} -{"task_id": "f_736", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nARRAY_SIZE = 10000\n\ndef f_736():\n \"\"\"\n Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution.\n\n Note:\n The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Integers\". \n The x-axis is labeled \"Value\" and the y-axis is labeled \"Frequency\". 
\n The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines.\n \n Returns:\n Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes).\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> array, mean, std, ax = f_736()\n >>> print(mean, std)\n 49.6135 28.5323416100046\n >>> plt.show()\n \"\"\"", "canonical_solution": " array = np.random.randint(1, 100, size=ARRAY_SIZE)\n mean = np.mean(array)\n std = np.std(array)\n\n fig, ax = plt.subplots()\n ax.hist(array, bins='auto')\n ax.set_title('Histogram of Random Integers')\n ax.set_xlabel('Value')\n ax.set_ylabel('Frequency')\n ax.axvline(mean, color='red', linestyle='dashed', linewidth=1)\n ax.axvline(mean + std, color='purple', linestyle='dashed', linewidth=1)\n ax.axvline(mean - std, color='purple', linestyle='dashed', linewidth=1)\n ax.legend([\"Mean\", \"Standard Deviation\"])\n plt.show()\n \n return array, mean, std, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n np.random.seed(0)\n array, mean, std, ax = f_736()\n self.assertEqual(array.size, ARRAY_SIZE)\n self.assertEqual(mean, 49.6135)\n self.assertEqual(std, 28.5323416100046)\n self.assertEqual(ax.get_title(), 'Histogram of Random Integers')\n def test_case_2(self):\n array, mean, std, ax = f_736()\n self.assertEqual(ax.get_xlabel(), 'Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n def test_case_3(self):\n np.random.seed(1)\n array, mean, std, ax = f_736()\n self.assertEqual(mean, 50.0717)\n self.assertEqual(std, 28.559862729186918)\n def test_case_4(self):\n np.random.seed(100)\n array, mean, std, ax = f_736()\n self.assertEqual(mean, 50.2223)\n self.assertEqual(std, 28.494467580742757)\n def test_case_5(self):\n np.random.seed(500)\n array, mean, std, ax = f_736()\n self.assertEqual(mean, 49.8636)\n self.assertEqual(std, 
28.516030492338864)", "apis": ["numpy.random.randint", "numpy.std", "numpy.mean", "numpy.random", "matplotlib.pyplot.show", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution."], "note": ["The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Integers\".", "The x-axis is labeled \"Value\" and the y-axis is labeled \"Frequency\".", "The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines."], "params": [], "returns": ["Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes)."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(0)", ">>> array, mean, std, ax = f_736()", ">>> print(mean, std)", "49.6135 28.5323416100046", ">>> plt.show()"]}} -{"task_id": "f_866", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_866(dataframe):\n \"\"\"\n Calculate the correlation matrix of a DataFrame and plot a scatter plot for the pair of columns with the highest absolute correlation.\n\n Parameters:\n - dataframe (pd.DataFrame): The DataFrame containing numeric columns for correlation calculation.\n\n Returns:\n - ax (plt.Axes): The scatter plot of the pair of columns with the highest absolute correlation.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib\n\n Exception Handling:\n - Raises ValueError if the input DataFrame is empty.\n - Raises TypeError if any column in the DataFrame is non-numeric.\n - Raises ValueError if the DataFrame has fewer than two columns.\n\n Example:\n >>> df = pd.DataFrame({\n ... 'A': np.random.rand(100),\n ... 'B': np.random.rand(100),\n ... 'C': np.random.rand(100)\n ... 
})\n >>> ax = f_866(df)\n >>> print(ax)\n Axes(0.125,0.11;0.775x0.77)\n \"\"\"", "canonical_solution": "\n if dataframe.empty:\n raise ValueError(\"DataFrame is empty.\")\n \n if not all(dataframe.dtypes.apply(lambda x: np.issubdtype(x, np.number))):\n raise TypeError(\"All columns must be numeric for correlation calculation.\")\n\n if dataframe.shape[1] < 2:\n raise ValueError(\"DataFrame must have at least two columns for correlation calculation.\")\n\n # Explicit use of pd.DataFrame.corr() to calculate the correlation matrix\n corr_matrix = pd.DataFrame.corr(dataframe)\n abs_corr_matrix = corr_matrix.abs()\n\n # Finding the pair of columns with the highest absolute correlation\n highest_corr_value = abs_corr_matrix.unstack().dropna().nlargest(2).iloc[-1]\n max_corr_pair = np.where(abs_corr_matrix == highest_corr_value)\n\n # Extracting column names for the highest correlation\n column_x = dataframe.columns[max_corr_pair[0][0]]\n column_y = dataframe.columns[max_corr_pair[1][0]]\n\n # Using plt to plot the scatter plot\n plt.figure(figsize=(10, 6)) # Creating a figure\n plt.scatter(dataframe[column_x], dataframe[column_y]) # Plotting the scatter plot\n plt.title(f\"Scatter plot between {column_x} and {column_y}\") # Setting the title\n plt.xlabel(column_x) # Setting the x-axis label\n plt.ylabel(column_y) # Setting the y-axis label\n plt.show() # Displaying the figure\n\n return plt.gca() # Returning the current Axes object for further use", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_866.\"\"\"\n def test_high_correlation(self):\n \"\"\"\n Test if the function correctly identifies and plots the pair of columns with the highest positive correlation.\n \"\"\"\n np.random.seed(0) # Set a fixed seed for reproducibility\n df = pd.DataFrame(\n {\"A\": np.arange(100), \"B\": np.arange(100) * 2, \"C\": np.random.rand(100)}\n )\n ax = 
f_866(df)\n corr = df.corr()\n abs_corr = corr.abs()\n max_corr = abs_corr.unstack().dropna().nlargest(3).iloc[-1]\n expected_pair = np.where(abs_corr == max_corr)\n expected_labels = (\n df.columns[expected_pair[0][0]],\n df.columns[expected_pair[1][0]],\n )\n self.assertEqual((ax.get_xlabel(), ax.get_ylabel()), expected_labels)\n def test_no_correlation(self):\n \"\"\"\n Test if the function handles a case where there is no significant correlation between columns.\n \"\"\"\n np.random.seed(1)\n df = pd.DataFrame(\n {\n \"A\": np.random.rand(100),\n \"B\": np.random.rand(100),\n \"C\": np.random.rand(100),\n }\n )\n ax = f_866(df)\n self.assertIsInstance(ax, plt.Axes)\n def test_negative_correlation(self):\n \"\"\"\n Test if the function correctly identifies and plots the pair of columns with the highest absolute correlation,\n including negative correlations.\n \"\"\"\n np.random.seed(2)\n df = pd.DataFrame(\n {\"A\": np.arange(100), \"B\": np.random.rand(100), \"C\": -np.arange(100) + 50}\n )\n ax = f_866(df)\n corr = df.corr()\n # Get the pair with the highest absolute correlation excluding self-correlations\n abs_corr = corr.abs()\n max_corr = abs_corr.unstack().dropna().nlargest(3).iloc[-1]\n expected_pair = np.where(abs_corr == max_corr)\n expected_labels = (\n df.columns[expected_pair[0][0]],\n df.columns[expected_pair[1][0]],\n )\n self.assertEqual((ax.get_xlabel(), ax.get_ylabel()), expected_labels)\n def test_single_column(self):\n \"\"\"\n Test if the function raises a ValueError when provided with a DataFrame containing only one column.\n \"\"\"\n np.random.seed(3)\n df = pd.DataFrame({\"A\": np.random.rand(100)})\n with self.assertRaises(ValueError):\n f_866(df)\n def test_non_numeric_columns(self):\n \"\"\"\n Test if the function raises a TypeError when provided with a DataFrame containing non-numeric columns.\n \"\"\"\n np.random.seed(4)\n df = pd.DataFrame(\n {\"A\": np.random.rand(100), \"B\": [\"text\"] * 100, \"C\": np.random.rand(100)}\n )\n 
with self.assertRaises(TypeError):\n f_866(df)\n def test_empty_dataframe(self):\n \"\"\"\n Test if the function raises a ValueError when provided with an empty DataFrame.\n \"\"\"\n df = pd.DataFrame() # Create an empty DataFrame\n with self.assertRaises(ValueError):\n f_866(df)", "apis": ["pandas.DataFrame.corr", "matplotlib.pyplot.scatter", "matplotlib.pyplot.xlabel", "numpy.issubdtype", "matplotlib.pyplot.show", "matplotlib.pyplot.title", "numpy.number", "matplotlib.pyplot.figure", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "numpy.where", "pandas.DataFrame"], "libs": ["matplotlib", "pandas", "numpy"], "doc": {"description": ["Calculate the correlation matrix of a DataFrame and plot a scatter plot for the pair of columns with the highest absolute correlation.", "Exception Handling:", "- Raises ValueError if the input DataFrame is empty.", "- Raises TypeError if any column in the DataFrame is non-numeric.", "- Raises ValueError if the DataFrame has fewer than two columns."], "note": [], "params": ["dataframe (pd.DataFrame): The DataFrame containing numeric columns for correlation calculation."], "returns": ["ax (plt.Axes): The scatter plot of the pair of columns with the highest absolute correlation."], "reqs": ["pandas", "numpy", "matplotlib"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 'A': np.random.rand(100),", "... 'B': np.random.rand(100),", "... 'C': np.random.rand(100)", "... })", ">>> ax = f_866(df)", ">>> print(ax)", "Axes(0.125,0.11;0.775x0.77)"]}} -{"task_id": "f_353", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\ndef f_353(mu=0, sigma=1):\n \"\"\"\n Draw and return a plot of a normal distribution with the given mean and standard deviation,\n utilizing numpy's linspace to create an array of 100 linearly spaced numbers between\n `mu - 3*sigma` and `mu + 3*sigma`.\n\n Parameters:\n mu (float): The mean of the distribution. 
Default is 0.\n sigma (float): The standard deviation of the distribution. Default is 1.\n\n Returns:\n matplotlib.axes.Axes: The plot representing the normal distribution.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - scipy.stats.norm\n\n Example:\n >>> ax = f_353(mu=5, sigma=2)\n >>> ax\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)\n y = norm.pdf(x, mu, sigma)\n\n fig, ax = plt.subplots()\n ax.plot(x, y)\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test default parameters\n ax = f_353()\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertAlmostEqual(x[np.argmax(y)], 0, delta=0.1)\n self.assertTrue(min(x) >= -3 and max(x) <= 3)\n def test_case_2(self):\n # Test positive mu and sigma with manual calculation\n ax = f_353(mu=5, sigma=2)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n expected_min, expected_max = 5 - 3 * 2, 5 + 3 * 2\n self.assertAlmostEqual(min(x), expected_min, delta=0.1)\n self.assertAlmostEqual(max(x), expected_max, delta=0.1)\n def test_case_3(self):\n # Test negative mu and small sigma\n ax = f_353(mu=-3, sigma=0.5)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertAlmostEqual(x[np.argmax(y)], -3, delta=0.1)\n self.assertTrue(min(x) >= -3 - 1.5 and max(x) <= -3 + 1.5)\n def test_case_4(self):\n # Test large mu and sigma\n mu, sigma = 1e6, 1e5\n ax = f_353(mu=mu, sigma=sigma)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertTrue(\n len(x) > 0 and len(y) > 0,\n \"Plot data should not be empty even for large mu and sigma.\",\n )\n def test_case_5(self):\n # Test negative mu\n ax = f_353(mu=-5, sigma=4)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertAlmostEqual(x[np.argmax(y)], -5, delta=0.15)\n self.assertTrue(min(x) >= -5 - 12 and max(x) <= -5 + 12)\n def test_case_6(self):\n # Test the 
function with a sigma of 0, which might represent a degenerate distribution\n ax = f_353(mu=0, sigma=0)\n lines = ax.get_lines()\n self.assertEqual(\n len(lines),\n 1,\n \"Plot should contain exactly one line for a degenerate distribution.\",\n )\n def test_case_7(self):\n # Test the function with extremely large values of mu and sigma to ensure it doesn't break\n ax = f_353(mu=1e6, sigma=1e5)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertTrue(\n len(x) > 0 and len(y) > 0,\n \"Plot data should not be empty even for large mu and sigma.\",\n )\n def test_case_8(self):\n # Test the function with a very small positive sigma to check narrow distributions\n ax = f_353(mu=0, sigma=1e-5)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n # Checking that the plot peak is at mu and sigma affects the curve's spread.\n self.assertAlmostEqual(\n x[np.argmax(y)],\n 0,\n delta=1e-5,\n msg=\"Peak of the distribution should be at mu.\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["scipy.stats.norm.pdf", "numpy.linspace", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Draw and return a plot of a normal distribution with the given mean and standard deviation,", "utilizing numpy's linspace to create an array of 100 linearly spaced numbers between", "`mu - 3*sigma` and `mu + 3*sigma`."], "note": [], "params": ["mu (float): The mean of the distribution. Default is 0.", "sigma (float): The standard deviation of the distribution. 
Default is 1."], "returns": ["matplotlib.axes.Axes: The plot representing the normal distribution."], "reqs": ["numpy", "matplotlib.pyplot", "scipy.stats.norm"], "raises": [], "example": [">>> ax = f_353(mu=5, sigma=2)", ">>> ax", "", ">>> type(ax)", ""]}} -{"task_id": "f_851", "prompt": "import requests\nfrom bs4 import BeautifulSoup\nimport pandas as pd\nfrom io import StringIO\n\n\ndef f_851(url, table_id):\n \"\"\"\n Extracts and converts data from a specified HTML table based on the given 'table_id' on a webpage into a Pandas DataFrame.\n If the table is present but contains no data rows (i.e., no tags),\n the function returns an empty DataFrame.\n\n Parameters:\n - url (str): The URL of the webpage from which to extract the table.\n - table_id (str): The 'id' attribute of the HTML table to be extracted.\n\n Returns:\n - df (pd.DataFrame): A DataFrame containing the data extracted from the specified HTML table.\n If the table is found but has no rows ( elements), an empty DataFrame is returned.\n\n Raises:\n - requests.exceptions.HTTPError: If the HTTP request fails (e.g., due to connection issues or\n a non-successful status code like 404 or 500).\n - ValueError: If no table with the specified 'table_id' is found on the webpage. 
The error message will be\n \"Table with the specified ID not found.\"\n\n Requirements:\n - requests\n - bs4.BeautifulSoup\n - pandas\n - io\n \n Notes:\n - The function raises an HTTPError for unsuccessful HTTP requests, which includes scenarios like\n network problems or non-2xx HTTP responses.\n - A ValueError is raised specifically when the HTML table with the specified ID is not present\n in the webpage's content, indicating either an incorrect ID or the absence of the table.\n - If the located table has no rows, indicated by the absence of tags, an empty DataFrame is returned.\n This is useful for handling tables that are structurally present in the HTML but are devoid of data.\n\n Example:\n >>> f_851('https://example.com/data.html', 'table1')\n DataFrame:\n Name Age\n 0 Alice 25\n 1 Bob 30\n\n Example of ValueError:\n >>> f_851('https://example.com/data.html', 'nonexistent_table')\n ValueError: Table with the specified ID not found.\n\n Example of empty table:\n >>> f_851('https://example.com/emptytable.html', 'empty_table')\n DataFrame:\n Empty DataFrame\n Columns: []\n Index: []\n \"\"\"", "canonical_solution": " try:\n response = requests.get(url, timeout=5)\n response.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code\n except requests.exceptions.HTTPError as e:\n raise e\n\n soup = BeautifulSoup(response.text, \"html.parser\")\n table = soup.find(\"table\", {\"id\": table_id})\n\n if table is None:\n raise ValueError(\"Table with the specified ID not found.\")\n\n # Check if the table is empty (no rows)\n if not table.find_all(\"tr\"):\n return pd.DataFrame()\n\n df = pd.read_html(StringIO(str(table)))[0]\n\n return df", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_851.\"\"\"\n @patch(\"requests.get\")\n def test_successful_scrape(self, mock_get):\n \"\"\"Test a successful scrape.\"\"\"\n 
mock_html_content = \"\"\"\n \n \n \n \n \n \n
NameAge
Alice25
Bob30
\n \n \n \"\"\"\n # Mock the response\n mock_response = MagicMock()\n mock_response.text = mock_html_content\n mock_get.return_value = mock_response\n # Test\n df = f_851(\"http://example.com\", \"table0\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertGreater(len(df), 0)\n self.assertIn(\"Name\", df.columns)\n self.assertIn(\"Age\", df.columns)\n @patch(\"requests.get\")\n def test_table_not_found(self, mock_get):\n \"\"\"Test table not found.\"\"\"\n mock_html_content = \"\"\n mock_response = MagicMock()\n mock_response.text = mock_html_content\n mock_get.return_value = mock_response\n # Test\n with self.assertRaises(ValueError):\n f_851(\"http://example.com\", \"non_existent_table\")\n @patch(\"requests.get\")\n def test_network_error(self, mock_get):\n \"\"\"Test network error.\"\"\"\n mock_get.side_effect = requests.exceptions.ConnectionError\n with self.assertRaises(requests.exceptions.ConnectionError):\n f_851(\"http://example.com\", \"table0\")\n @patch(\"requests.get\")\n def test_http_error(self, mock_get):\n \"\"\"Test HTTP error.\"\"\"\n mock_get.return_value.raise_for_status.side_effect = (\n requests.exceptions.HTTPError\n )\n # Test\n with self.assertRaises(requests.exceptions.HTTPError):\n f_851(\"http://example.com\", \"table0\")\n @patch(\"requests.get\")\n def test_empty_table(self, mock_get):\n # Mock HTML content with an empty table\n mock_html_content = \"\"\"\n \n \n
\n \n \n \"\"\"\n # Mock the response\n mock_response = MagicMock()\n mock_response.text = mock_html_content\n mock_get.return_value = mock_response\n # Test\n df = f_851(\"http://example.com\", \"table0\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(len(df), 0)", "apis": ["bs4.BeautifulSoup", "pandas.read_html", "requests.get", "io.StringIO", "requests.exceptions", "pandas.DataFrame"], "libs": ["bs4", "pandas", "requests", "io"], "doc": {"description": ["Extracts and converts data from a specified HTML table based on the given 'table_id' on a webpage into a Pandas DataFrame.", "If the table is present but contains no data rows (i.e., no tags),", "the function returns an empty DataFrame.", "Notes:", "- The function raises an HTTPError for unsuccessful HTTP requests, which includes scenarios like", "network problems or non-2xx HTTP responses.", "- A ValueError is raised specifically when the HTML table with the specified ID is not present", "in the webpage's content, indicating either an incorrect ID or the absence of the table.", "- If the located table has no rows, indicated by the absence of tags, an empty DataFrame is returned.", "This is useful for handling tables that are structurally present in the HTML but are devoid of data.", "Example of ValueError:", ">>> f_851('https://example.com/data.html', 'nonexistent_table')", "ValueError: Table with the specified ID not found.", "Example of empty table:", ">>> f_851('https://example.com/emptytable.html', 'empty_table')", "DataFrame:", "Empty DataFrame", "Columns: []", "Index: []"], "note": [], "params": ["url (str): The URL of the webpage from which to extract the table.", "table_id (str): The 'id' attribute of the HTML table to be extracted."], "returns": ["df (pd.DataFrame): A DataFrame containing the data extracted from the specified HTML table.", "If the table is found but has no rows ( elements), an empty DataFrame is returned."], "reqs": ["requests", "bs4.BeautifulSoup", "pandas", "io"], 
"raises": ["requests.exceptions.HTTPError: If the HTTP request fails (e.g., due to connection issues or", "a non-successful status code like 404 or 500).", "ValueError: If no table with the specified 'table_id' is found on the webpage. The error message will be", "\"Table with the specified ID not found.\""], "example": [">>> f_851('https://example.com/data.html', 'table1')", "DataFrame:", "Name Age", "0 Alice 25", "1 Bob 30"]}} -{"task_id": "f_399", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_399(column, data):\n \"\"\"\n Analyze a list of employee data and calculate statistics for a given column. If the data list is empty,\n the sum will be 0 and mean, min, and max values will be NaN. The function also visualizes the data with\n a pie chart, using the Age column as labels.\n\n Parameters:\n column (str): The column to analyze. Valid values are 'Age', 'Salary', and 'Experience'.\n If invalid, the function will raise KeyError.\n data (list of lists): The employee data, where each list represents [Age, Salary, Experience].\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the sum, mean, min, and max of the column.\n - Axes object: The pie chart visualizing the column data.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> data = [[25, 50000, 2], [30, 75000, 5], [35, 100000, 7], [40, 125000, 10], [45, 150000, 12]]\n >>> stats, ax = f_399('Salary', data)\n >>> stats\n {'sum': 500000, 'mean': 100000.0, 'min': 50000, 'max': 150000}\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " # Constants encapsulated within the function\n COLUMNS = [\"Age\", \"Salary\", \"Experience\"]\n\n df = pd.DataFrame(data, columns=COLUMNS)\n column_data = df[column]\n\n # Handle empty data\n if df.empty:\n result = {\"sum\": 0, \"mean\": np.nan, \"min\": np.nan, \"max\": np.nan}\n else:\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.mean(column_data),\n \"min\": 
np.min(column_data),\n \"max\": np.max(column_data),\n }\n\n fig, ax = plt.subplots()\n ax.pie(column_data, labels=df[\"Age\"], autopct=\"%1.1f%%\")\n ax.set_title(f\"Pie Chart of {column}\")\n\n return result, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Tests the 'Salary' column with normal data\n data = [\n [25, 50000, 2],\n [30, 75000, 5],\n [35, 100000, 7],\n [40, 125000, 10],\n [45, 150000, 12],\n ]\n stats, ax = f_399(\"Salary\", data)\n self.assertEqual(\n stats, {\"sum\": 500000, \"mean\": 100000.0, \"min\": 50000, \"max\": 150000}\n )\n def test_case_2(self):\n # Tests the 'Experience' column\n data = [\n [26, 52000, 3],\n [31, 76000, 6],\n [36, 101000, 8],\n [41, 126000, 11],\n [46, 151000, 13],\n ]\n stats, ax = f_399(\"Experience\", data)\n self.assertEqual(stats, {\"sum\": 41, \"mean\": 8.2, \"min\": 3, \"max\": 13})\n def test_case_3(self):\n # Tests the 'Age' column\n data = [\n [27, 53000, 4],\n [32, 77000, 7],\n [37, 102000, 9],\n [42, 127000, 12],\n [47, 152000, 14],\n ]\n stats, ax = f_399(\"Age\", data)\n self.assertEqual(stats, {\"sum\": 185, \"mean\": 37.0, \"min\": 27, \"max\": 47})\n def test_case_4(self):\n # Test edge case when data is empty\n data = []\n stats, ax = f_399(\"Salary\", data)\n self.assertEqual(\n stats, {\"sum\": 0, \"mean\": np.nan, \"min\": np.nan, \"max\": np.nan}\n )\n def test_case_5(self):\n # Tests with a single data entry\n data = [[30, 75000, 5]]\n stats, ax = f_399(\"Age\", data)\n self.assertEqual(stats, {\"sum\": 30, \"mean\": 30.0, \"min\": 30, \"max\": 30})\n self.assertTrue(\n isinstance(ax, plt.Axes),\n \"The plotting object is not an instance of matplotlib.axes._axes.Axes\",\n )\n def test_case_6(self):\n # Tests handling of an invalid column name\n data = [[25, 50000, 2], [30, 75000, 5]]\n with self.assertRaises(KeyError):\n f_399(\"InvalidColumn\", data)\n def test_case_7(self):\n # Tests that the pie 
chart is correctly generated for given data\n data = [\n [25, 50000, 2],\n [30, 75000, 5],\n [35, 100000, 7],\n [40, 125000, 10],\n [45, 150000, 12],\n ]\n _, ax = f_399(\"Salary\", data)\n # Verify the number of pie slices matches the number of data points\n self.assertEqual(\n len(ax.patches),\n len(data),\n \"The number of pie slices does not match the number of data points.\",\n )\n # Optionally, check for the presence of labels (Ages)\n labels = [str(age) for age, _, _ in data] # Extracting age labels from data\n plot_labels = [text.get_text() for text in ax.texts]\n self.assertTrue(\n all(label in plot_labels for label in labels),\n \"Not all expected labels are present in the plot.\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.mean", "numpy.nan", "numpy.min", "numpy.max", "numpy.sum", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["numpy", "pandas", "matplotlib"], "doc": {"description": ["Analyze a list of employee data and calculate statistics for a given column. If the data list is empty,", "the sum will be 0 and mean, min, and max values will be NaN. The function also visualizes the data with", "a pie chart, using the Age column as labels."], "note": [], "params": ["column (str): The column to analyze. 
Valid values are 'Age', 'Salary', and 'Experience'.", "If invalid, the function will raise KeyError.", "data (list of lists): The employee data, where each list represents [Age, Salary, Experience]."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the sum, mean, min, and max of the column.", "Axes object: The pie chart visualizing the column data."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [[25, 50000, 2], [30, 75000, 5], [35, 100000, 7], [40, 125000, 10], [45, 150000, 12]]", ">>> stats, ax = f_399('Salary', data)", ">>> stats", "{'sum': 500000, 'mean': 100000.0, 'min': 50000, 'max': 150000}", ">>> type(ax)", ""]}} -{"task_id": "f_612", "prompt": "import os\nimport shutil\nimport glob\n\ndef f_612(source_dir, dest_dir, extension):\n \"\"\"\n Move all files with a particular extension from one directory to another.\n \n Parameters:\n - source_dir (str): The source directory.\n - dest_dir (str): The destination directory.\n - extension (str): The file extension.\n\n Returns:\n - result (int): The count of files that were moved. 
\n\n Requirements:\n - os\n - shutil\n - glob\n \n Example:\n >>> f_612('path_to_source_dir', 'path_to_dest_dir', '.txt')\n 10\n \"\"\"", "canonical_solution": " files = glob.glob(os.path.join(source_dir, f'*.{extension}'))\n \n for file in files:\n shutil.move(file, dest_dir)\n \n result = len(files)\n\n return result", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n for d in ['./source', './destination', './src', './dst', './s', './d']:\n if os.path.exists(d):\n shutil.rmtree(d)\n def test_case_1(self):\n # Create source directory\n if os.path.exists('./source'):\n shutil.rmtree('./source')\n os.mkdir('./source')\n # Create destination directory\n if os.path.exists('./destination'):\n shutil.rmtree('./destination')\n os.mkdir('./destination')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./source', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./source', './destination', 'txt')\n # Check files\n for d in ['./destination', './source']:\n if d == './source':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./source')\n shutil.rmtree('./destination')\n def test_case_2(self):\n # Create source directory\n if os.path.exists('./src'):\n shutil.rmtree('./src')\n os.mkdir('./src')\n # Create destination directory\n if os.path.exists('./dst'):\n shutil.rmtree('./dst')\n os.mkdir('./dst')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 
'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./src', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./src', './dst', 'txt')\n # Check files\n for d in ['./dst', './src']:\n if d == './src':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./src')\n shutil.rmtree('./dst')\n def test_case_3(self):\n # Create source directory\n if os.path.exists('./s'):\n shutil.rmtree('./s')\n os.mkdir('./s')\n # Create destination directory\n if os.path.exists('./d'):\n shutil.rmtree('./d')\n os.mkdir('./d')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./s', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./s', './d', 'txt')\n # Check files\n for d in ['./d', './s']:\n if d == './s':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./s')\n shutil.rmtree('./d')\n def test_case_4(self):\n # Create source directory\n if os.path.exists('./s'):\n shutil.rmtree('./s')\n os.mkdir('./s')\n # Create destination directory\n if 
os.path.exists('./destination'):\n shutil.rmtree('./destination')\n os.mkdir('./destination')\n # Create files\n for filename in ['bbb.txt', 'a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./s', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./s', './destination', 'txt')\n # Check files\n for d in ['./destination', './s']:\n if d == './s':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./s')\n shutil.rmtree('./destination')\n def test_case_5(self):\n # Create source directory\n if os.path.exists('./source'):\n shutil.rmtree('./source')\n os.mkdir('./source')\n # Create destination directory\n if os.path.exists('./d'):\n shutil.rmtree('./d')\n os.mkdir('./d')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./source', filename), 'w') as f:\n f.write('xxx')\n # Run function\n f_612('./source', './d', 'docx')\n # Check files\n for d in ['./d', './source']:\n if d == './source':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 
'd.docx')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))", "apis": ["os.path", "os.path.join", "glob.glob", "shutil.move"], "libs": ["glob", "shutil", "os"], "doc": {"description": ["Move all files with a particular extension from one directory to another."], "note": [], "params": ["source_dir (str): The source directory.", "dest_dir (str): The destination directory.", "extension (str): The file extension."], "returns": ["result (int): The count of files that were moved."], "reqs": ["os", "shutil", "glob"], "raises": [], "example": [">>> f_612('path_to_source_dir', 'path_to_dest_dir', '.txt')", "10"]}} -{"task_id": "f_918", "prompt": "import pytz\nfrom dateutil.parser import parse\n\n# Constants\nTIME_FORMAT = \"%d/%m/%y %H:%M:%S.%f\"\n\n\ndef f_918(time_string, from_tz, to_tz):\n \"\"\"\n Converts a time string from one timezone to another, considering various cases such as daylight saving time.\n\n Parameters:\n - time_string (str): A time string in the format 'dd/mm/yy HH:MM:SS.fff'. This string should represent a valid date and time.\n - from_tz (str): The timezone of the given time string. The timezone should be a valid IANA timezone name (e.g., 'UTC', 'America/New_York').\n - to_tz (str): The target timezone to which the time string should be converted. This should also be a valid IANA timezone name (e.g., 'Asia/Tokyo').\n\n Returns:\n - str: The converted time string in the format 'dd/mm/yy HH:MM:SS.fff'. 
The conversion takes into account any differences in daylight saving rules between the source and target timezones.\n\n Requirements:\n - pytz\n - dateutil\n\n Example:\n >>> f_918('30/03/09 16:31:32.123', 'UTC', 'America/New_York')\n '30/03/09 12:31:32.123000'\n\n Note: The example assumes no daylight saving time shift between the given timezones at the specified date and time.\n \"\"\"", "canonical_solution": " from_zone = pytz.timezone(from_tz)\n to_zone = pytz.timezone(to_tz)\n dt = parse(time_string, dayfirst=True)\n dt = from_zone.localize(dt)\n dt = dt.astimezone(to_zone)\n\n return dt.strftime(TIME_FORMAT)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_918\"\"\"\n def test_utc_to_est(self):\n \"\"\"\n Test conversion from UTC to Eastern Standard Time.\n \"\"\"\n result = f_918(\"30/03/09 16:31:32.123\", \"UTC\", \"America/New_York\")\n expected = \"30/03/09 12:31:32.123000\" # Adjusted for daylight saving time if applicable\n self.assertEqual(result, expected)\n def test_est_to_utc(self):\n \"\"\"\n Test conversion from Eastern Standard Time to UTC.\n \"\"\"\n result = f_918(\"30/03/09 12:31:32.123\", \"America/New_York\", \"UTC\")\n expected = \"30/03/09 16:31:32.123000\" # Adjusted for daylight saving time if applicable\n self.assertEqual(result, expected)\n def test_utc_to_ist(self):\n \"\"\"\n Test conversion from UTC to Indian Standard Time.\n \"\"\"\n result = f_918(\"01/04/09 00:00:00.000\", \"UTC\", \"Asia/Kolkata\")\n expected = \"01/04/09 05:30:00.000000\" # IST is UTC+5:30\n self.assertEqual(result, expected)\n def test_ist_to_utc(self):\n \"\"\"\n Test conversion from Indian Standard Time to UTC.\n \"\"\"\n result = f_918(\"01/04/09 05:30:00.000\", \"Asia/Kolkata\", \"UTC\")\n expected = \"01/04/09 00:00:00.000000\" # IST is UTC+5:30\n self.assertEqual(result, expected)\n def test_utc_to_gmt(self):\n \"\"\"\n Test conversion from UTC to GMT (should be the same).\n \"\"\"\n result = f_918(\"15/04/09 
10:30:00.000\", \"UTC\", \"GMT\")\n expected = \"15/04/09 10:30:00.000000\" # GMT and UTC are the same\n self.assertEqual(result, expected)", "apis": ["dateutil.parser.parse", "pytz.timezone"], "libs": ["dateutil", "pytz"], "doc": {"description": ["Converts a time string from one timezone to another, considering various cases such as daylight saving time."], "note": ["The example assumes no daylight saving time shift between the given timezones at the specified date and time."], "params": ["time_string (str): A time string in the format 'dd/mm/yy HH:MM:SS.fff'. This string should represent a valid date and time.", "from_tz (str): The timezone of the given time string. The timezone should be a valid IANA timezone name (e.g., 'UTC', 'America/New_York').", "to_tz (str): The target timezone to which the time string should be converted. This should also be a valid IANA timezone name (e.g., 'Asia/Tokyo')."], "returns": ["str: The converted time string in the format 'dd/mm/yy HH:MM:SS.fff'. The conversion takes into account any differences in daylight saving rules between the source and target timezones."], "reqs": ["pytz", "dateutil"], "raises": [], "example": [">>> f_918('30/03/09 16:31:32.123', 'UTC', 'America/New_York')", "'30/03/09 12:31:32.123000'"]}} -{"task_id": "f_777", "prompt": "import pandas as pd\nimport string\n\ndef f_777(word):\n \"\"\"\n Creates a Pandas DataFrame from a single word, where each row contains a letter from the word \n and its 1-based position in the alphabet.\n\n Requirements:\n - pandas\n - string\n \n Parameters:\n - word (str): The word to create the DataFrame from. 
The word should be in lowercase and consist of alphabetic characters only.\n \n Returns:\n - pandas.DataFrame: A DataFrame with two columns: 'Letter' and 'Position', \n where 'Position' is the letter's position in the English alphabet.\n \n Examples:\n >>> f_777('abc')\n Letter Position\n 0 a 1\n 1 b 2\n 2 c 3\n\n >>> f_777('zoo')\n Letter Position\n 0 z 26\n 1 o 15\n 2 o 15\n \n Raises:\n - ValueError: If the input word is not in lowercase or contains non-alphabetic characters.\n \"\"\"", "canonical_solution": " if not word: # Check if the input word is empty and return an empty DataFrame\n return pd.DataFrame({'Letter': [], 'Position': []})\n elif not word.isalpha() or not word.islower():\n raise ValueError(\"Input word must be in lowercase alphabetic characters only.\")\n\n alphabet = string.ascii_lowercase\n positions = [alphabet.index(char) + 1 for char in word]\n df = pd.DataFrame({'Letter': list(word), 'Position': positions})\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_abc(self):\n \"\"\"Test with the word 'abc'.\"\"\"\n result = f_777('abc')\n expected = pd.DataFrame({'Letter': ['a', 'b', 'c'], 'Position': [1, 2, 3]})\n pd.testing.assert_frame_equal(result, expected)\n def test_xyz(self):\n \"\"\"Test with the word 'xyz'.\"\"\"\n result = f_777('xyz')\n expected = pd.DataFrame({'Letter': ['x', 'y', 'z'], 'Position': [24, 25, 26]})\n pd.testing.assert_frame_equal(result, expected)\n def test_mixed_case_error(self):\n \"\"\"Test with a mixed case word, expecting a ValueError.\"\"\"\n with self.assertRaises(ValueError):\n f_777('AbC')\n def test_non_alpha_error(self):\n \"\"\"Test with a non-alphabetic word, expecting a ValueError.\"\"\"\n with self.assertRaises(ValueError):\n f_777('123')\n def test_empty_string(self):\n \"\"\"Test with an empty string, expecting an empty DataFrame.\"\"\"\n result = f_777('')\n expected = pd.DataFrame({'Letter': [], 'Position': []})\n 
pd.testing.assert_frame_equal(result, expected)", "apis": ["string.ascii_lowercase", "pandas.DataFrame"], "libs": ["string", "pandas"], "doc": {"description": ["Creates a Pandas DataFrame from a single word, where each row contains a letter from the word", "and its 1-based position in the alphabet.", ">>> f_777('zoo')", "Letter Position", "0 z 26", "1 o 15", "2 o 15"], "note": [], "params": ["word (str): The word to create the DataFrame from. The word should be in lowercase and consist of alphabetic characters only."], "returns": ["pandas.DataFrame: A DataFrame with two columns: 'Letter' and 'Position',", "where 'Position' is the letter's position in the English alphabet."], "reqs": ["pandas", "string"], "raises": ["ValueError: If the input word is not in lowercase or contains non-alphabetic characters."], "example": ["Examples:", ">>> f_777('abc')", "Letter Position", "0 a 1", "1 b 2", "2 c 3"]}} -{"task_id": "f_337", "prompt": "import numpy as np\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_337(df1, df2, column1=\"feature1\", column2=\"feature2\"):\n \"\"\"Merge datasets, perform KMeans clustering, then return cluster labels and scatterplot.\n\n Each dataset is assumed to contain at least one id column and one feature column. The column to process\n is specified for df1 and df2 via column1 and column2, respectively. KMeans clustering is applied\n with k=2 and n_init=10. Resulting scatterplot shows column1 on the x-axis, column2 on the y-axis,\n and predicted cluster as color.\n\n Parameters:\n - df1 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column1.\n - df2 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column2.\n - column1 (str): Name of column containing features to model in df1. Defaults to \"feature1\".\n - column2 (str): Name of column containing features to model in df2. 
Defaults to \"feature2\".\n\n Returns:\n - labels (np.ndarray): Cluster labels for each data point (dtype=int32).\n - ax (matplotlib.axes._axes.Axes): The plotted figure's Axes object.\n\n Requirements:\n - sklearn.cluster.KMeans\n - matplotlib.pyplot\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': [2.3, 4.5, 6.7]})\n >>> labels, ax = f_337(df1, df2)\n >>> type(labels)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " df = pd.merge(df1, df2, on=\"id\")\n X = df[[column1, column2]]\n\n kmeans = KMeans(n_clusters=2, n_init=10)\n kmeans.fit(X)\n labels = kmeans.labels_\n\n _, ax = plt.subplots()\n ax.scatter(X[column1], X[column2], c=kmeans.labels_)\n ax.set_xlabel(column1)\n ax.set_ylabel(column2)\n\n return labels, ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Sample dataframes for testing\n self.df1_base = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature1\": [1.2, 3.4, 5.6, 7.8, 9.0]}\n )\n self.df2_base = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature2\": [2.3, 4.5, 6.7, 8.9, 10.1]}\n )\n def tearDown(self):\n plt.close(\"all\")\n def test_case_1(self):\n # Test scatterplot\n _, ax = f_337(self.df1_base, self.df2_base)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertEqual(ax.get_xlabel(), \"feature1\")\n self.assertEqual(ax.get_ylabel(), \"feature2\")\n def test_case_2(self):\n # Expect 2 clusters\n labels, _ = f_337(self.df1_base, self.df2_base)\n self.assertEqual(len(labels), 5)\n self.assertEqual(len(np.unique(labels)), 2)\n def test_case_3(self):\n # Mixed valid data types\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [1, 2, 3]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [1.1, 2.2, 3.3]})\n labels, _ = f_337(df1, df2)\n self.assertEqual(len(labels), 3)\n def test_case_4(self):\n # Partial matches\n df1 = 
pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [1.2, 3.4, 5.6]})\n df2 = pd.DataFrame({\"id\": [1, 2, 6], \"feature2\": [1.2, 3.1, 6.7]})\n labels, _ = f_337(df1, df2)\n self.assertEqual(len(labels), 2)\n self.assertEqual(len(np.unique(labels)), 2)\n def test_case_5(self):\n # Should fail when there's no matching id\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [1.2, 3.4, 5.6]})\n df2 = pd.DataFrame({\"id\": [4, 5, 6], \"feature2\": [2.3, 4.5, 6.7]})\n with self.assertRaises(ValueError):\n f_337(df1, df2)\n def test_case_6(self):\n # Should fail on non-numeric columns\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [\"a\", \"b\", \"c\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [1.1, 2.2, 3.3]})\n with self.assertRaises(Exception):\n f_337(df1, df2)\n def test_case_7(self):\n # Should fail on missing value\n df1 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature1\": [1.2, np.nan, 5.6, 7.8, 9.0]}\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature2\": [2.3, 4.5, np.nan, 8.9, 10.1]}\n )\n with self.assertRaises(ValueError):\n f_337(df1, df2)", "apis": ["sklearn.cluster.KMeans", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "sklearn"], "doc": {"description": ["Merge datasets, perform KMeans clustering, then return cluster labels and scatterplot.", "Each dataset is assumed to contain at least one id column and one feature column. The column to process", "is specified for df1 and df2 via column1 and column2, respectively. KMeans clustering is applied", "with k=2 and n_init=10. Resulting scatterplot shows column1 on the x-axis, column2 on the y-axis,", "and predicted cluster as color."], "note": [], "params": ["df1 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column1.", "df2 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column2.", "column1 (str): Name of column containing features to model in df1. 
Defaults to \"feature1\".", "column2 (str): Name of column containing features to model in df2. Defaults to \"feature2\"."], "returns": ["labels (np.ndarray): Cluster labels for each data point (dtype=int32).", "ax (matplotlib.axes._axes.Axes): The plotted figure's Axes object."], "reqs": ["sklearn.cluster.KMeans", "matplotlib.pyplot"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': [2.3, 4.5, 6.7]})", ">>> labels, ax = f_337(df1, df2)", ">>> type(labels)", "", ">>> type(ax)", ""]}} -{"task_id": "f_803", "prompt": "import string\nimport random\n\n\ndef f_803(text, seed=None):\n \"\"\"\n Generates a password that mirrors the structure of the given text by replacing alphabetic\n characters with random ascii lowercase letters, digits with random single-digit numbers,\n spaces wth either a random digit or random lowercase letter at equal probabilities, and\n leaving other characters unchanged.\n\n Parameters:\n - text (str): The text to be mirrored in the generated password. Must not be empty.\n - seed (int, optional): Seed for the random number generator. Defaults to None (not set).\n\n Returns:\n - str: The generated password.\n\n Raises:\n - ValueError: If the input text is empty.\n\n Requirements:\n - random\n - string\n\n Note:\n - This function does not handle high Unicode characters and focuses only on ASCII values.\n\n Examples:\n >>> f_803(\"hello world! 
123\", 0)\n 'mbqmp3jytre!v553'\n >>> f_803(\"apple321#\", seed=42)\n 'uahev901#'\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n if not text:\n raise ValueError(\"text cannot be empty.\")\n password = \"\"\n for char in text:\n random_lowercase = random.choice(string.ascii_lowercase)\n random_digit = random.choice(string.digits)\n if char.isalpha():\n password += random_lowercase\n elif char.isdigit():\n password += random_digit\n elif char == \" \":\n if random.random() < 0.5:\n password += random_lowercase\n else:\n password += random_digit\n else:\n password += char\n return password", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n result = f_803(\"Hello123\", seed=1)\n self.assertEqual(len(result), 8)\n for i, char in enumerate(\"Hello123\"):\n if char.isalpha():\n self.assertTrue(result[i].isalpha())\n elif char.isdigit():\n self.assertTrue(result[i].isdigit())\n def test_case_2(self):\n # Test basic case with alphabet only\n result = f_803(\"ABC\", seed=2)\n self.assertEqual(len(result), 3)\n self.assertTrue(all(char.isalpha() for char in result))\n def test_case_3(self):\n # Test basic case with digit only\n result = f_803(\"123\", seed=3)\n self.assertEqual(len(result), 3)\n self.assertTrue(all(char.isdigit() for char in result))\n def test_case_4(self):\n # Test basic case with whitespace, alphabet, number, special char\n text = \"Hello, world!\"\n result = f_803(text, seed=4)\n self.assertEqual(len(result), 13)\n for i, char in enumerate(text):\n result_char = result[i]\n if char.isalpha():\n self.assertTrue(result_char.isalpha())\n elif char.isdigit():\n self.assertTrue(result_char.isdigit())\n elif char == \" \":\n self.assertTrue(result_char.isalnum())\n else:\n self.assertEqual(result[i], char)\n def test_case_5(self):\n # Test handling empty string\n with self.assertRaises(Exception):\n f_803(\"\", seed=5)", "apis": ["string.ascii_lowercase", "random.seed", 
"random.choice", "string.digits", "random.random"], "libs": ["string", "random"], "doc": {"description": ["Generates a password that mirrors the structure of the given text by replacing alphabetic", "characters with random ascii lowercase letters, digits with random single-digit numbers,", "spaces wth either a random digit or random lowercase letter at equal probabilities, and", "leaving other characters unchanged."], "note": ["This function does not handle high Unicode characters and focuses only on ASCII values."], "params": ["text (str): The text to be mirrored in the generated password. Must not be empty.", "seed (int, optional): Seed for the random number generator. Defaults to None (not set)."], "returns": ["str: The generated password."], "reqs": ["random", "string"], "raises": ["ValueError: If the input text is empty."], "example": ["Examples:", ">>> f_803(\"hello world! 123\", 0)", "'mbqmp3jytre!v553'", ">>> f_803(\"apple321#\", seed=42)", "'uahev901#'"]}} -{"task_id": "f_889", "prompt": "from datetime import datetime\nimport numpy as np\nfrom dateutil.parser import parse\n\nLEAP_SECONDS = np.array(\n [\n 1972,\n 1973,\n 1974,\n 1975,\n 1976,\n 1977,\n 1978,\n 1979,\n 1980,\n 1981,\n 1982,\n 1983,\n 1985,\n 1988,\n 1990,\n 1993,\n 1994,\n 1997,\n 1999,\n 2006,\n 2009,\n 2012,\n 2015,\n 2016,\n 2020,\n ]\n)\n\n\ndef f_889(date_str):\n \"\"\"\n Calculate the total number of seconds elapsed from a given date until the current time,\n including any leap seconds that occurred in this period.\n\n Parameters:\n date_str (str): The date and time from which to calculate, in \"yyyy-mm-dd hh:mm:ss\" format.\n\n Returns:\n int: The total number of elapsed seconds, including leap seconds, since the given date.\n\n Requirements:\n - datetime.datetime\n - numpy\n - dateutil.parser.parse\n \n Note:\n This function uses the datetime, numpy, and dateutil.parser modules.\n The LEAP_SECONDS array should contain years when leap seconds were added.\n\n Example:\n >>> 
total_seconds = f_889('1970-01-01 00:00:00')\n >>> print(total_seconds)\n 1702597276\n \"\"\"", "canonical_solution": " given_date = parse(date_str)\n current_date = datetime.now()\n\n total_seconds = (current_date - given_date).total_seconds()\n\n # Count leap seconds that occurred between the two dates\n leap_seconds = np.sum(LEAP_SECONDS >= given_date.year)\n\n total_seconds += leap_seconds\n\n return int(total_seconds)", "test": "import unittest\nfrom datetime import datetime, timedelta\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_889.\"\"\"\n def test_recent_date(self):\n \"\"\"\n Test the function with a recent date.\n \"\"\"\n test_date = \"2022-01-01 00:00:00\"\n expected_result = (datetime.now() - datetime(2022, 1, 1)).total_seconds()\n expected_result += np.sum(LEAP_SECONDS >= 2022)\n self.assertEqual(f_889(test_date), int(expected_result))\n def test_date_before_leap_seconds(self):\n \"\"\"\n Test the function with a date before the introduction of leap seconds.\n \"\"\"\n test_date = \"1960-01-01 00:00:00\"\n expected_result = (datetime.now() - datetime(1960, 1, 1)).total_seconds()\n expected_result += np.sum(LEAP_SECONDS >= 1960)\n self.assertEqual(f_889(test_date), int(expected_result))\n def test_date_with_leap_second(self):\n \"\"\"\n Test the function with a date in a year when a leap second was added.\n \"\"\"\n test_date = \"2016-01-01 00:00:00\"\n expected_result = (datetime.now() - datetime(2016, 1, 1)).total_seconds()\n expected_result += np.sum(LEAP_SECONDS >= 2016)\n self.assertAlmostEqual(f_889(test_date), int(expected_result), delta=1)\n def test_future_date(self):\n \"\"\"\n Test the function with a future date.\n \"\"\"\n future_date = datetime.now() + timedelta(days=30)\n future_date_str = future_date.strftime(\"%Y-%m-%d %H:%M:%S\")\n result = f_889(future_date_str)\n expected_result = -30 * 24 * 3600 # Negative seconds for future dates\n # Allowing a margin of error of 1 second\n 
self.assertTrue(abs(result - expected_result) <= 1)\n def test_current_date(self):\n \"\"\"\n Test the function with the current date and time.\n \"\"\"\n current_date_str = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n self.assertEqual(f_889(current_date_str), 0)", "apis": ["dateutil.parser.parse", "numpy.array", "numpy.sum", "datetime.datetime.now"], "libs": ["dateutil", "numpy", "datetime"], "doc": {"description": ["Calculate the total number of seconds elapsed from a given date until the current time,", "including any leap seconds that occurred in this period."], "note": ["This function uses the datetime, numpy, and dateutil.parser modules.", "The LEAP_SECONDS array should contain years when leap seconds were added."], "params": ["date_str (str): The date and time from which to calculate, in \"yyyy-mm-dd hh:mm:ss\" format."], "returns": ["int: The total number of elapsed seconds, including leap seconds, since the given date."], "reqs": ["datetime.datetime", "numpy", "dateutil.parser.parse"], "raises": [], "example": [">>> total_seconds = f_889('1970-01-01 00:00:00')", ">>> print(total_seconds)", "1702597276"]}} -{"task_id": "f_830", "prompt": "import json\nimport pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\nimport matplotlib.pyplot as plt\n\n\ndef f_830(json_data: str, data_key: str):\n \"\"\"\n Processes a JSON string to extract numerical data, Min-Max normalize them,\n and generate a line plot.\n\n Parameters:\n - json_data (str): JSON formatted string containing the data.\n - data_key (str): Dot-separated full key path to access the numerical data within the JSON structure.\n\n Returns:\n - Tuple:\n - pd.Series: Original dataset in float64.\n - pd.Series or None: Dataset after Min-Max scaling in float64, or None if data is empty.\n - plt.Axes or None: Line plot of normalized data, or None if data is empty.\n\n Raises:\n - KeyError: if key path is not found in the given data.\n\n Requirements:\n - json\n - pandas\n - sklearn\n - 
matplotlib\n\n Notes:\n - The line plot includes labeled axes and a legend. It visualizes the original\n data with label \"Original Data\" and normalized ones as \"Normalized Data\".\n The function sets the plot title to \"Comparison of Original and Normalized Data\",\n with \"Index\" on the x-axis and \"Value\" on the y-axis.\n\n Example:\n >>> json_str = '{\"data\": {\"values\": [5, 10, 15, 20, 25]}}'\n >>> original_data, normalized_data, ax = f_830(json_str, 'data.values')\n >>> type(original_data), type(normalized_data), type(ax)\n (, , )\n \"\"\"", "canonical_solution": " data = json.loads(json_data)\n try:\n data = json.loads(json_data)\n for key in data_key.split(\".\"):\n data = data[key]\n values = pd.Series(data, dtype=pd.Float64Dtype)\n except KeyError:\n raise KeyError(f\"Key path '{data_key}' not found in the provided JSON data.\")\n\n if values.empty:\n return values, None, None\n\n scaler = MinMaxScaler()\n normalized_values = pd.Series(\n scaler.fit_transform(values.values.reshape(-1, 1)).flatten(),\n dtype=pd.Float64Dtype,\n )\n\n fig, ax = plt.subplots()\n ax.plot(values, label=\"Original Data\")\n ax.plot(normalized_values, label=\"Normalized Data\")\n ax.set_title(\"Comparison of Original and Normalized Data\")\n ax.set_xlabel(\"Index\")\n ax.set_ylabel(\"Value\")\n ax.legend()\n\n return values, normalized_values, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_data_extraction(self):\n json_str = '{\"data\": {\"values\": [0.5, 10, 15, 20]}}'\n data_key = \"data.values\"\n original_data, _, _ = f_830(json_str, data_key)\n expected_series = pd.Series([0.5, 10, 15, 20], dtype=pd.Float64Dtype)\n pd.testing.assert_series_equal(original_data, expected_series)\n def test_data_normalization(self):\n json_str = '{\"data\": {\"values\": [0, 10, 20, 30, 40]}}'\n data_key = \"data.values\"\n _, normalized_data, _ = f_830(json_str, data_key)\n expected_normalized = 
pd.Series(\n [0.0, 0.25, 0.5, 0.75, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized)\n def test_plot_properties(self):\n json_str = '{\"data\": {\"values\": [1, 2, 3, 4, 5]}}'\n data_key = \"data.values\"\n _, _, ax = f_830(json_str, data_key)\n self.assertEqual(ax.get_title(), \"Comparison of Original and Normalized Data\")\n self.assertEqual(ax.get_xlabel(), \"Index\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n legend_texts = [text.get_text() for text in ax.get_legend().get_texts()]\n self.assertIn(\"Original Data\", legend_texts)\n self.assertIn(\"Normalized Data\", legend_texts)\n def test_empty_data(self):\n json_str = '{\"data\": {\"values\": []}}'\n data_key = \"data.values\"\n original_data, normalized_data, ax = f_830(json_str, data_key)\n self.assertTrue(original_data.empty)\n self.assertIsNone(normalized_data)\n self.assertIsNone(ax)\n def test_non_uniform_data_spacing(self):\n json_str = '{\"data\": {\"values\": [1, 1, 2, 3, 5, 8]}}'\n data_key = \"data.values\"\n _, normalized_data, _ = f_830(json_str, data_key)\n expected_normalized = pd.Series(\n [0.0, 0.0, 0.142857, 0.285714, 0.571429, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized, atol=1e-6)\n def test_negative_values(self):\n json_str = '{\"data\": {\"values\": [-50, -20, 0, 20, 50]}}'\n data_key = \"data.values\"\n _, normalized_data, _ = f_830(json_str, data_key)\n expected_normalized = pd.Series(\n [0.0, 0.3, 0.5, 0.7, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized, atol=1e-5)\n def test_nested_json_structure(self):\n json_str = '{\"data\": {\"deep\": {\"deeper\": {\"values\": [2, 4, 6, 8, 10]}}}}'\n data_key = \"data.deep.deeper.values\"\n original_data, _, _ = f_830(json_str, data_key)\n expected_series = pd.Series([2, 4, 6, 8, 10], dtype=pd.Float64Dtype)\n pd.testing.assert_series_equal(original_data, expected_series)\n def 
test_complex_json_structure(self):\n json_str = \"\"\"\n {\n \"metadata\": {\n \"source\": \"sensor_array\",\n \"timestamp\": \"2023-04-11\"\n },\n \"readings\": {\n \"temperature\": [20, 22, 21, 23, 24],\n \"humidity\": [30, 32, 31, 33, 34],\n \"data\": {\n \"deep\": {\n \"deeper\": {\n \"values\": [100, 200, 300, 400, 500]\n },\n \"another_level\": {\n \"info\": \"This should not be processed\"\n }\n }\n }\n }\n }\"\"\"\n data_key = \"readings.data.deep.deeper.values\"\n original_data, normalized_data, ax = f_830(json_str, data_key)\n expected_series = pd.Series([100, 200, 300, 400, 500], dtype=pd.Float64Dtype)\n pd.testing.assert_series_equal(original_data, expected_series)\n expected_normalized = pd.Series(\n [0.0, 0.25, 0.5, 0.75, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized, atol=1e-5)\n self.assertIsInstance(ax, plt.Axes)", "apis": ["sklearn.preprocessing.MinMaxScaler", "pandas.Float64Dtype", "pandas.Series", "json.loads", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas", "json", "sklearn"], "doc": {"description": ["Processes a JSON string to extract numerical data, Min-Max normalize them,", "and generate a line plot.", "Notes:", "- The line plot includes labeled axes and a legend. 
It visualizes the original", "data with label \"Original Data\" and normalized ones as \"Normalized Data\".", "The function sets the plot title to \"Comparison of Original and Normalized Data\",", "with \"Index\" on the x-axis and \"Value\" on the y-axis."], "note": [], "params": ["json_data (str): JSON formatted string containing the data.", "data_key (str): Dot-separated full key path to access the numerical data within the JSON structure."], "returns": ["Tuple:", "pd.Series: Original dataset in float64.", "pd.Series or None: Dataset after Min-Max scaling in float64, or None if data is empty.", "plt.Axes or None: Line plot of normalized data, or None if data is empty."], "reqs": ["json", "pandas", "sklearn", "matplotlib"], "raises": ["KeyError: if key path is not found in the given data."], "example": [">>> json_str = '{\"data\": {\"values\": [5, 10, 15, 20, 25]}}'", ">>> original_data, normalized_data, ax = f_830(json_str, 'data.values')", ">>> type(original_data), type(normalized_data), type(ax)", "(, , )"]}} -{"task_id": "f_903", "prompt": "import numpy as np\nimport random\nimport itertools\nimport pandas as pd\n\n# Constants\nPLANETS = [\n \"Mercury\",\n \"Venus\",\n \"Earth\",\n \"Mars\",\n \"Jupiter\",\n \"Saturn\",\n \"Uranus\",\n \"Neptune\",\n]\nELEMENTS = [\n \"Hydrogen\",\n \"Helium\",\n \"Oxygen\",\n \"Carbon\",\n \"Nitrogen\",\n \"Magnesium\",\n \"Silicon\",\n \"Iron\",\n \"Nickel\",\n]\n\n\ndef f_903():\n \"\"\"\n Generate a DataFrame where each row contains random planet-element pairs.\n Each pair is formatted as 'Planet:Element'. 
The number of rows is determined by\n the number of planets, and each row will contain as many planet-element pairs as there are elements.\n\n Parameters:\n - None\n\n Returns:\n pandas.DataFrame: A DataFrame where each cell contains a string in the format 'Planet:Element'.\n The DataFrame has a number of rows equal to the number of planets and\n a number of columns equal to the number of elements.\n\n Requirements:\n - numpy\n - random\n - itertools\n - pandas\n\n Example:\n >>> random.seed(0)\n >>> planet_elements_table = f_903()\n >>> planet_elements_table.head(2)\n Hydrogen Helium Oxygen Carbon Nitrogen Magnesium Silicon Iron Nickel\n 0 Uranus:Silicon Earth:Silicon Neptune:Silicon Neptune:Nickel Uranus:Hydrogen Jupiter:Iron Neptune:Nitrogen Earth:Nickel Uranus:Helium\n 1 Venus:Magnesium Saturn:Helium Mars:Nitrogen Mercury:Helium Jupiter:Nitrogen Venus:Oxygen Neptune:Magnesium Mercury:Iron Venus:Helium\n \"\"\"", "canonical_solution": " # Generate all possible pairs\n pairs = [\n f\"{planet}:{element}\"\n for planet, element in itertools.product(PLANETS, ELEMENTS)\n ]\n # Shuffle the pairs to ensure randomness\n random.shuffle(pairs)\n\n # Convert the list of pairs into a numpy array, then reshape it to fit the DataFrame dimensions\n data = np.array(pairs).reshape(len(PLANETS), len(ELEMENTS))\n # Create the DataFrame with ELEMENTS as column headers\n df = pd.DataFrame(data, columns=ELEMENTS)\n\n return df", "test": "import unittest\nimport itertools\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for `f_903`.\"\"\"\n def test_basic_structure(self):\n \"\"\"Test the basic structure of the table.\"\"\"\n random.seed(0)\n table = f_903()\n # Verify the structure of the table\n self.assertEqual(len(table), len(PLANETS))\n self.assertEqual(list(table.columns), ELEMENTS)\n def test_pair_existence(self):\n \"\"\"Test the existence of planet-element pairs.\"\"\"\n random.seed(1)\n table = f_903()\n # Verify all planet-element pairs 
are present\n all_pairs = set(f\"{p}:{e}\" for p, e in itertools.product(PLANETS, ELEMENTS))\n generated_pairs = set(table.values.flatten())\n self.assertEqual(all_pairs, generated_pairs)\n # Verify no extra pairs are present\n self.assertEqual(len(all_pairs), len(generated_pairs))\n def test_data_type(self):\n \"\"\"Test the data type of the table and its elements.\"\"\"\n random.seed(2)\n table = f_903()\n # Check the data type of the table and its elements\n self.assertIsInstance(table, pd.DataFrame)\n self.assertTrue(all(isinstance(cell, str) for cell in table.values.flatten()))\n def test_data_format(self):\n \"\"\"Test the format of the elements in the table.\"\"\"\n random.seed(3)\n table = f_903()\n # Check the format of the elements in the table\n self.assertTrue(\n all(\n \":\" in cell and len(cell.split(\":\")) == 2\n for cell in table.values.flatten()\n )\n )\n def test_uniqueness(self):\n \"\"\"Test the uniqueness of the pairs.\"\"\"\n random.seed(4)\n table = f_903()\n # Check uniqueness of the pairs\n generated_pairs = table.values.flatten()\n self.assertEqual(len(generated_pairs), len(set(generated_pairs)))", "apis": ["random.shuffle", "numpy.array", "pandas.DataFrame", "itertools.product"], "libs": ["numpy", "pandas", "random", "itertools"], "doc": {"description": ["Generate a DataFrame where each row contains random planet-element pairs.", "Each pair is formatted as 'Planet:Element'. 
The number of rows is determined by", "the number of planets, and each row will contain as many planet-element pairs as there are elements."], "note": [], "params": ["None"], "returns": ["pandas.DataFrame: A DataFrame where each cell contains a string in the format 'Planet:Element'.", "The DataFrame has a number of rows equal to the number of planets and", "a number of columns equal to the number of elements."], "reqs": ["numpy", "random", "itertools", "pandas"], "raises": [], "example": [">>> random.seed(0)", ">>> planet_elements_table = f_903()", ">>> planet_elements_table.head(2)", "Hydrogen Helium Oxygen Carbon Nitrogen Magnesium Silicon Iron Nickel", "0 Uranus:Silicon Earth:Silicon Neptune:Silicon Neptune:Nickel Uranus:Hydrogen Jupiter:Iron Neptune:Nitrogen Earth:Nickel Uranus:Helium", "1 Venus:Magnesium Saturn:Helium Mars:Nitrogen Mercury:Helium Jupiter:Nitrogen Venus:Oxygen Neptune:Magnesium Mercury:Iron Venus:Helium"]}} -{"task_id": "f_417", "prompt": "from collections import Counter\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_417(num_rolls, num_dice, plot_path=None, random_seed=0):\n \"\"\"Simulate rolling a certain number of a standard six-sided dice several times, then\n identify and display the distribution of the sums of the dice rolls in a bar plot.\n\n Parameters:\n - num_rolls (int): The number of times to roll the dice.\n - num_dice (int): The number of dice to roll each time.\n - plot_path (str, optional): Path to save the generated plot. If not provided, plot is not saved.\n - random_seed (int): Random seed for reproducibility. 
Defaults to 0.\n\n Returns:\n tuple: A tuple containing the following elements:\n - Counter: A Counter object with the count of each possible sum.\n - Axes: A matplotlib Axes object representing the bar plot of the Distribution of Dice Roll Sums,\n with Sum of Dice Roll on the x-axis and count on the y-axis.\n\n Requirements:\n - collections.Counter\n - random\n - matplotlib.pyplot\n\n Example:\n >>> result, ax = f_417(10000, 2, 'output.png')\n >>> type(result)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " POSSIBLE_VALUES = list(range(1, 7))\n\n random.seed(random_seed)\n\n sums = []\n for _ in range(num_rolls):\n roll = [random.choice(POSSIBLE_VALUES) for _ in range(num_dice)]\n sums.append(sum(roll))\n\n sums_counter = Counter(sums)\n\n labels, values = zip(*sums_counter.items())\n\n plt.bar(labels, values)\n plt.xlabel(\"Sum of Dice Roll\")\n plt.ylabel(\"Count\")\n plt.title(\"Distribution of Dice Roll Sums\")\n ax = plt.gca()\n if plot_path:\n plt.savefig(plot_path)\n\n return sums_counter, ax", "test": "import unittest\nimport os\nfrom collections import Counter\nimport tempfile\nimport shutil\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Create a temporary directory to store plots\n self.test_dir = tempfile.mkdtemp()\n def tearDown(self):\n # Close matplotlib plots and remove temporary directory\n plt.close(\"all\")\n shutil.rmtree(self.test_dir)\n def test_case_1(self):\n # Test basic functionality with 100 rolls and 2 dice\n result, ax = f_417(100, 2, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_2(self):\n # Test plot saving functionality\n plot_path = os.path.join(self.test_dir, \"test_plot.png\")\n result, ax = f_417(1000, 1, plot_path, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(os.path.exists(plot_path))\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_3(self):\n # Test with a larger 
number of dice\n result, ax = f_417(500, 5, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_4(self):\n # Test with the minimum possible inputs\n result, ax = f_417(1, 1, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(len(result), 1) # Only one possible sum with 1 roll of 1 die\n def test_case_5(self):\n # Test the effect of different random seeds on the result consistency\n result1, _ = f_417(100, 2, random_seed=42)\n result2, _ = f_417(100, 2, random_seed=43)\n self.assertNotEqual(\n result1, result2, \"Results should differ with different seeds\"\n )\n def test_case_6(self):\n # Test plot detail correctness (labels, title)\n plot_path = os.path.join(self.test_dir, \"test_plot_detail.png\")\n _, ax = f_417(10, 2, plot_path, random_seed=42)\n self.assertTrue(\n \"sum of dice roll\" in ax.get_xlabel().lower(), \"X-axis label is incorrect\"\n )\n self.assertEqual(ax.get_ylabel(), \"Count\", \"Y-axis label is incorrect\")\n self.assertTrue(\n \"distribution of dice roll sums\" in ax.get_title().lower(),\n \"Plot title is incorrect\",\n )\n def test_case_7(self):\n # Test data correctness with a manually calculated example\n result, _ = f_417(2, 1, random_seed=42)\n expected = Counter({6: 1, 1: 1})\n self.assertEqual(\n result, expected, \"Data distribution does not match expected outcome\"\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["collections.Counter", "matplotlib.pyplot.xlabel", "random.seed", "random.choice", "matplotlib.pyplot.title", "matplotlib.pyplot.savefig", "matplotlib.pyplot.bar", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca"], "libs": ["matplotlib", "collections", "random"], "doc": {"description": ["Simulate rolling a certain number of a standard six-sided dice several times, then", "identify and display the distribution of the sums of the dice rolls in a bar plot."], "note": [], "params": 
["num_rolls (int): The number of times to roll the dice.", "num_dice (int): The number of dice to roll each time.", "plot_path (str, optional): Path to save the generated plot. If not provided, plot is not saved.", "random_seed (int): Random seed for reproducibility. Defaults to 0."], "returns": ["tuple: A tuple containing the following elements:", "Counter: A Counter object with the count of each possible sum.", "Axes: A matplotlib Axes object representing the bar plot of the Distribution of Dice Roll Sums,", "with Sum of Dice Roll on the x-axis and count on the y-axis."], "reqs": ["collections.Counter", "random", "matplotlib.pyplot"], "raises": [], "example": [">>> result, ax = f_417(10000, 2, 'output.png')", ">>> type(result)", "", ">>> type(ax)", ""]}} -{"task_id": "f_834", "prompt": "import binascii\nimport string\nimport random\n\ndef f_834(length):\n \"\"\"\n Generate a random hexadecimal string of a given length and then attempt to decode it in ASCII.\n The resulting ASCII string may contain non-printable characters\n or be shorter than the input length.\n\n Parameters:\n length (int): The length of the hexadecimal string.\n\n Returns:\n str: The decoded ASCII string.\n\n Requirements:\n - binascii\n - string\n - random\n\n Example:\n >>> random.seed(0)\n >>> f_834(6)\n '\\\\x18'\n >>> f_834(8)\n '\u01a4'\n \"\"\"", "canonical_solution": " HEX_CHARS = string.hexdigits.lower()\n hex_string = \"\".join(random.choice(HEX_CHARS) for _ in range(length))\n return binascii.unhexlify(hex_string).decode(\"utf-8\", \"ignore\")", "test": "import unittest\nimport string\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_834\"\"\"\n def test_correct_length(self):\n \"\"\"Test the length of the hexadecimal string before decoding.\"\"\"\n random.seed(2)\n length = 8\n HEX_CHARS = string.hexdigits.lower()\n hex_string = \"\".join(random.choice(HEX_CHARS) for _ in range(length))\n result = f_834(length)\n # Check if the length of the hexadecimal 
string before decoding is correct\n self.assertEqual(len(hex_string), length)\n self.assertEqual(result, \"]\")\n def test_correct_type(self):\n \"\"\"Test the type of the output.\"\"\"\n random.seed(4)\n result = f_834(6)\n self.assertIsInstance(result, str)\n self.assertEqual(result, \"y<\")\n def test_non_empty_string_positive_length(self):\n \"\"\"Test the output for a positive length.\"\"\"\n random.seed(6)\n result = f_834(6)\n self.assertNotEqual(result, \"\")\n self.assertEqual(result, \"\\x10\")\n def test_zero_length(self):\n \"\"\"Test the output for a zero length.\"\"\"\n random.seed(8)\n result = f_834(0)\n self.assertEqual(result, \"\")\n def test_negative_length_handling(self):\n \"\"\"Test the output for a negative length.\"\"\"\n random.seed(10)\n result = f_834(-1)\n self.assertEqual(result, \"\")", "apis": ["string.hexdigits", "binascii.unhexlify", "random.choice", "string.hexdigits.lower"], "libs": ["string", "binascii", "random"], "doc": {"description": ["Generate a random hexadecimal string of a given length and then attempt to decode it in ASCII.", "The resulting ASCII string may contain non-printable characters", "or be shorter than the input length."], "note": [], "params": ["length (int): The length of the hexadecimal string."], "returns": ["str: The decoded ASCII string."], "reqs": ["binascii", "string", "random"], "raises": [], "example": [">>> random.seed(0)", ">>> f_834(6)", "'\\\\x18'", ">>> f_834(8)", "'\u01a4'"]}} -{"task_id": "f_791", "prompt": "import numpy as np\nfrom sklearn.preprocessing import MinMaxScaler\n\ndef f_791(rows=3, columns=2, seed=42):\n \"\"\"\n Generate a matrix of random values with specified dimensions and scale it between 0 and 1.\n \n Parameters:\n rows (int): The number of rows for the matrix. Default is 3.\n columns (int): The number of columns for the matrix. 
Default is 2.\n \n Returns:\n ndarray: A numpy ndarray with scaled values between 0 and 1.\n \n Requirements:\n - numpy\n - sklearn.preprocessing.MinMaxScaler\n \n Example:\n >>> f_791(3, 2)\n array([[0.37939383, 1. ],\n [1. , 0.55700635],\n [0. , 0. ]])\n \n >>> f_791(2, 2)\n array([[0., 1.],\n [1., 0.]])\n \"\"\"", "canonical_solution": " np.random.seed(seed) # Ensure reproducibility for consistent outputs across different runs\n matrix = np.random.rand(rows, columns)\n scaler = MinMaxScaler()\n scaled_matrix = scaler.fit_transform(matrix)\n\n return scaled_matrix", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n result = f_791()\n self.assertEqual(result.shape, (3, 2))\n self.assertTrue(np.all(result >= 0))\n \n def test_case_2(self):\n result = f_791(2, 2)\n self.assertEqual(result.shape, (2, 2))\n self.assertTrue(np.all(result >= 0) and np.all(result <= 1))\n \n def test_case_3(self):\n result = f_791(4, 3)\n self.assertEqual(result.shape, (4, 3))\n self.assertTrue(np.all(result >= 0) and np.all(result <= 1))\n \n def test_case_4(self):\n result = f_791(5, 1)\n self.assertEqual(result.shape, (5, 1))\n self.assertTrue(np.all(result >= 0))\n \n def test_case_5(self):\n result = f_791(1, 5)\n self.assertEqual(result.shape, (1, 5))\n self.assertTrue(np.all(result >= 0) and np.all(result <= 1))", "apis": ["numpy.random", "sklearn.preprocessing.MinMaxScaler", "numpy.random.rand", "numpy.random.seed"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Generate a matrix of random values with specified dimensions and scale it between 0 and 1.", ">>> f_791(2, 2)", "array([[0., 1.],", "[1., 0.]])"], "note": [], "params": ["rows (int): The number of rows for the matrix. Default is 3.", "columns (int): The number of columns for the matrix. 
Default is 2."], "returns": ["ndarray: A numpy ndarray with scaled values between 0 and 1."], "reqs": ["numpy", "sklearn.preprocessing.MinMaxScaler"], "raises": [], "example": [">>> f_791(3, 2)", "array([[0.37939383, 1. ],", "[1. , 0.55700635],", "[0. , 0. ]])"]}} -{"task_id": "f_394", "prompt": "from datetime import datetime, timedelta\nimport pytz\nimport calendar\n\n\ndef f_394(days_in_past=7):\n \"\"\"\n Get the weekday of the date 'days_in_past' days ago from today.\n\n This function computes the date that is 'days_in_past' number of days ago from the current\n system time's date in UTC. It then determines the weekday of this target date using calendar\n and returns its name as a string.\n\n Parameters:\n days_in_past (int): The number of days to go back from the current date to find the weekday.\n Defaults to 7 (one week ago). Must be a non-negative integer.\n\n Returns:\n weekday (str) : The name of the weekday (e.g., 'Monday', 'Tuesday') for the computed date.\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - pytz\n - calendar\n\n Example:\n >>> f_394()\n 'Monday'\n >>> f_394(3)\n 'Friday'\n \"\"\"", "canonical_solution": " if days_in_past < 0:\n raise ValueError(\"Days in the past cannot be negative\")\n\n date = datetime.now(pytz.UTC) - timedelta(days=days_in_past)\n weekday = calendar.day_name[date.weekday()]\n\n return weekday", "test": "import unittest\nfrom datetime import datetime, timedelta\nimport pytz\nimport calendar\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Input 1: Default input\n result = f_394()\n self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for 7 days ago\n expected_date = datetime.now(pytz.UTC) - timedelta(days=7)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_2(self):\n # Input 2: Test with 3 days in the past\n result = f_394(3)\n 
self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for 3 days ago\n expected_date = datetime.now(pytz.UTC) - timedelta(days=3)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_3(self):\n # Input 3: Test with 0 days in the past (today)\n result = f_394(0)\n self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for today\n expected_date = datetime.now(pytz.UTC)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_4(self):\n # Input 4: Test with 30 days in the past (approximately a month ago)\n result = f_394(30)\n self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for 30 days ago\n expected_date = datetime.now(pytz.UTC) - timedelta(days=30)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_5(self):\n # Input 5: Test handling invalid days_in_the_past\n for invalid in [-1, \"1\"]:\n with self.assertRaises(Exception):\n f_394(invalid)", "apis": ["calendar.day_name", "datetime.timedelta", "pytz.UTC", "datetime.datetime.now"], "libs": ["datetime", "calendar", "pytz"], "doc": {"description": ["Get the weekday of the date 'days_in_past' days ago from today.", "This function computes the date that is 'days_in_past' number of days ago from the current", "system time's date in UTC. It then determines the weekday of this target date using calendar", "and returns its name as a string."], "note": [], "params": ["days_in_past (int): The number of days to go back from the current date to find the weekday.", "Defaults to 7 (one week ago). 
Must be a non-negative integer."], "returns": ["weekday (str) : The name of the weekday (e.g., 'Monday', 'Tuesday') for the computed date."], "reqs": ["datetime.datetime", "datetime.timedelta", "pytz", "calendar"], "raises": [], "example": [">>> f_394()", "'Monday'", ">>> f_394(3)", "'Friday'"]}} -{"task_id": "f_928", "prompt": "import pandas as pd\nfrom sklearn.feature_selection import f_oneway\n\ndef f_928(data_file_path: str):\n \"\"\"\n Analyzes numerical data from a CSV file. The function reads the CSV file, converts string representations of\n numbers with commas into floating point numbers, calculates the mean and standard deviation for each numerical column,\n generates a histogram plot for each numerical column, and performs an ANOVA test to check the statistical significance \n of differences between means of numerical columns (if applicable).\n\n Parameters:\n - data_file_path (str): Path to the CSV data file.\n\n Returns:\n - means (pd.Series): Mean values of each numerical column.\n - std_devs (pd.Series): Standard deviation values of each numerical column.\n - axes (list[matplotlib.axes.Axes]): List of histogram plots for each numerical column.\n - anova_results (pd.DataFrame): ANOVA test results for each pair of numerical columns (if more than one numerical column is present).\n\n Requirements:\n - pandas\n - sklearn\n\n Note:\n - The function assumes that all columns in the CSV file contain numerical data or string representations of numerical data.\n - The ANOVA test is only performed if there are two or more numerical columns. 
Compute two columns \"F-value\" and \"P-value\" for each pair of numerical columns.\n\n Example:\n >>> means, std_devs, axes, anova_results = f_928('data.csv')\n >>> print(f'Means: {means}, Standard Deviations: {std_devs}')\n >>> print(anova_results)\n \"\"\"", "canonical_solution": " df = pd.read_csv(data_file_path)\n # Convert strings with commas to float, if applicable\n for col in df.columns:\n df[col] = pd.to_numeric(df[col].replace(\",\", \"\", regex=True), errors=\"coerce\")\n # drop columns with NaN values\n df = df.dropna(axis=1)\n means = df.mean()\n std_devs = df.std()\n\n # Creating a histogram for each numerical column\n axes = []\n for col in df.columns:\n ax = df[col].hist(bins=50)\n ax.set_title(col)\n axes.append(ax)\n\n plt.show()\n\n # ANOVA Test if more than one numerical column\n anova_results = None\n if len(df.columns) > 1:\n anova_results = pd.DataFrame(f_oneway(*[df[col] for col in df.columns if df[col].dtype != 'object']),\n index=['F-value', 'P-value'], \n columns=['ANOVA Results'])\n\n return means, std_devs, axes, anova_results", "test": "import unittest\nfrom unittest.mock import patch\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_928\"\"\"\n @patch(\"pandas.read_csv\")\n def test_empty_file(self, mock_read_csv):\n \"\"\"\n Test the function with an empty CSV file.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame()\n means, std_devs, axes, anova_results = f_928(\"empty.csv\")\n self.assertTrue(means.empty)\n self.assertTrue(std_devs.empty)\n self.assertEqual(len(axes), 0)\n self.assertIsNone(anova_results)\n @patch(\"pandas.read_csv\")\n def test_single_column(self, mock_read_csv):\n \"\"\"\n Test the function with a CSV file having a single numerical column.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [1, 2, 3, 4, 5]})\n means, std_devs, axes, anova_results = f_928(\"single_column.csv\")\n self.assertEqual(means[\"A\"], 3)\n 
self.assertAlmostEqual(std_devs[\"A\"], 1.5811, places=4)\n self.assertEqual(len(axes), 1)\n self.assertIsNone(anova_results)\n @patch(\"pandas.read_csv\")\n def test_multiple_columns(self, mock_read_csv):\n \"\"\"\n Test the function with a CSV file having multiple numerical columns.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n means, _, axes, anova_results = f_928(\"multiple_columns.csv\")\n self.assertEqual(means[\"A\"], 2)\n self.assertEqual(means[\"B\"], 5)\n self.assertEqual(len(axes), 2)\n self.assertEqual(anova_results[\"ANOVA Results\"][\"F-value\"], 13.5)\n self.assertAlmostEqual(anova_results[\"ANOVA Results\"][\"P-value\"], 0.021312, places=5)\n \n @patch(\"pandas.read_csv\")\n def test_numerical_and_non_numerical_columns(self, mock_read_csv):\n \"\"\"\n Test the function with a mix of numerical and non-numerical columns.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [\"a\", \"b\", \"c\"]})\n means, std_devs, axes, anova_results = f_928(\"mixed_columns.csv\")\n self.assertEqual(len(means), 1) # Only one numerical column\n self.assertEqual(len(std_devs), 1)\n self.assertEqual(len(axes), 1)\n self.assertIsNone(anova_results)\n @patch(\"pandas.read_csv\")\n def test_with_special_characters(self, mock_read_csv):\n \"\"\"\n Test the function with a CSV file containing numbers with special characters (e.g., commas).\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [\"1,000\", \"2,000\", \"3,000\"]})\n means, std_devs, axes, anova_results = f_928(\"special_characters.csv\")\n self.assertAlmostEqual(means[\"A\"], 2000, places=0)\n self.assertAlmostEqual(std_devs[\"A\"], pd.Series([1000, 2000, 3000]).std(), places=0)\n self.assertEqual(len(axes), 1)\n self.assertIsNone(anova_results)\n def tearDown(self):\n plt.close()", "apis": ["pandas.to_numeric", "sklearn.feature_selection.f_oneway", "pandas.read_csv", "pandas.DataFrame"], "libs": ["pandas", "sklearn"], "doc": 
{"description": ["Analyzes numerical data from a CSV file. The function reads the CSV file, converts string representations of", "numbers with commas into floating point numbers, calculates the mean and standard deviation for each numerical column,", "generates a histogram plot for each numerical column, and performs an ANOVA test to check the statistical significance", "of differences between means of numerical columns (if applicable)."], "note": ["The function assumes that all columns in the CSV file contain numerical data or string representations of numerical data.", "The ANOVA test is only performed if there are two or more numerical columns. Compute two columns \"F-value\" and \"P-value\" for each pair of numerical columns."], "params": ["data_file_path (str): Path to the CSV data file."], "returns": ["means (pd.Series): Mean values of each numerical column.", "std_devs (pd.Series): Standard deviation values of each numerical column.", "axes (list[matplotlib.axes.Axes]): List of histogram plots for each numerical column.", "anova_results (pd.DataFrame): ANOVA test results for each pair of numerical columns (if more than one numerical column is present)."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> means, std_devs, axes, anova_results = f_928('data.csv')", ">>> print(f'Means: {means}, Standard Deviations: {std_devs}')", ">>> print(anova_results)"]}} -{"task_id": "f_794", "prompt": "import pandas as pd\nimport numpy as np\nimport random\nfrom random import randint, seed\n\n# Constants\nCATEGORIES = ['Electronics', 'Clothing', 'Home & Kitchen', 'Books', 'Toys & Games']\n\ndef f_794(mystrings, n_products, seed=0):\n \"\"\"\n Create a product catalog DataFrame where each row represents a product with the following columns:\n - 'Product Name': The name of the product with spaces replaced by underscores.\n - 'Category': The category to which the product belongs.\n - 'Price': The price of the product, generated randomly based on a normal 
distribution with a mean of 50 and a standard deviation of 10.\n \n Parameters:\n mystrings (list of str): List of product names.\n n_products (int): Number of products to generate in the catalog.\n\n Returns:\n pd.DataFrame: A pandas DataFrame containing the product catalog information.\n\n Requirements:\n - pandas\n - numpy\n - random.randint\n - random.seed\n\n Constants:\n - CATEGORIES: A list of categories used to randomly assign a category to each product.\n\n Examples:\n >>> f_794(['Mobile Phone', 'T Shirt', 'Coffee Maker', 'Python Book', 'Toy Car'], 2)\n Product Name Category Price\n 0 Python_Book Books 67.64\n 1 Mobile_Phone Home & Kitchen 54.00\n >>> f_794(['Laptop', 'Sweater'], 1)\n Product Name Category Price\n 0 Sweater Books 67.64\n \"\"\"", "canonical_solution": " catalogue_data = []\n random.seed(seed)\n np.random.seed(seed)\n for _ in range(n_products):\n product_name = mystrings[randint(0, len(mystrings) - 1)].replace(' ', '_')\n category = CATEGORIES[randint(0, len(CATEGORIES) - 1)]\n price = round(np.random.normal(50, 10), 2)\n catalogue_data.append([product_name, category, price])\n\n catalogue_df = pd.DataFrame(catalogue_data, columns=['Product Name', 'Category', 'Price'])\n\n return catalogue_df", "test": "import unittest\nfrom pandas.testing import assert_frame_equal\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n \n result = f_794(['Mobile Phone', 'T Shirt', 'Coffee Maker', 'Python Book', 'Toy Car'], 2, 42)\n # assert the value of the DataFrame\n self.assertEqual(result['Product Name'].tolist(), ['Mobile_Phone', 'Coffee_Maker'])\n self.assertEqual(result['Category'].tolist(), ['Electronics', 'Clothing'])\n self.assertEqual(result['Price'].tolist(), [54.97, 48.62])\n \n def test_case_2(self):\n result = f_794(['Laptop', 'Sweater'], 1)\n self.assertEqual(result['Product Name'].tolist(), ['Sweater'])\n self.assertEqual(result['Category'].tolist(), ['Books'])\n self.assertEqual(result['Price'].tolist(), [67.64])\n \n def 
test_case_3(self):\n result = f_794(['Book', 'Pen', 'Bag'], 3)\n self.assertEqual(result['Product Name'].tolist(), ['Pen', 'Book', 'Bag'])\n self.assertEqual(result['Category'].tolist(), ['Books', 'Home & Kitchen', 'Books'])\n self.assertEqual(result['Price'].tolist(), [67.64, 54.00, 59.79])\n \n def test_case_4(self):\n result = f_794(['Watch'], 2)\n self.assertEqual(result['Product Name'].tolist(), ['Watch', 'Watch'])\n self.assertEqual(result['Category'].tolist(), ['Books', 'Home & Kitchen'])\n self.assertEqual(result['Price'].tolist(), [67.64, 54.00])\n def test_case_5(self):\n result = f_794(['TV', 'Fridge', 'Sofa', 'Table'], 0)\n self.assertEqual(result.empty, True)", "apis": ["random.randint", "random.seed", "numpy.random", "numpy.random.seed", "numpy.random.normal", "pandas.DataFrame"], "libs": ["numpy", "pandas", "random"], "doc": {"description": ["Create a product catalog DataFrame where each row represents a product with the following columns:", "- 'Product Name': The name of the product with spaces replaced by underscores.", "- 'Category': The category to which the product belongs.", "- 'Price': The price of the product, generated randomly based on a normal distribution with a mean of 50 and a standard deviation of 10.", "Constants:", "- CATEGORIES: A list of categories used to randomly assign a category to each product."], "note": [], "params": ["mystrings (list of str): List of product names.", "n_products (int): Number of products to generate in the catalog."], "returns": ["pd.DataFrame: A pandas DataFrame containing the product catalog information."], "reqs": ["pandas", "numpy", "random.randint", "random.seed"], "raises": [], "example": ["Examples:", ">>> f_794(['Mobile Phone', 'T Shirt', 'Coffee Maker', 'Python Book', 'Toy Car'], 2)", "Product Name Category Price", "0 Python_Book Books 67.64", "1 Mobile_Phone Home & Kitchen 54.00", ">>> f_794(['Laptop', 'Sweater'], 1)", "Product Name Category Price", "0 Sweater Books 67.64"]}} -{"task_id": "f_335", 
"prompt": "import pandas as pd\nimport seaborn as sns\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_335(df1, df2):\n \"\"\"\n Merge two dataframes on the 'id' column and then scale the numeric features.\n\n This function merges two dataframes via outer join on the 'id' column, and scales the merged dataframe's\n numeric features from df1 to have a mean of 0 and standard deviation of 1. It also returns a pair plot of\n the scaled features from df1.\n\n Parameters:\n - df1 (pd.DataFrame): Left dataframe to merge into.\n - df2 (pd.DataFrame): Right dataframe to merge from.\n\n Returns:\n - merged_df (pd.DataFrame): The partially scaled and merged dataframe.\n - pair_plot (seaborn.axisgrid.PairGrid): Pair plot of the scaled dataframe.\n\n Requirements:\n - pandas\n - sklearn\n - seaborn\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature4': [4.5, 6.7, 8.9], 'feature5': [5.6, 7.8, 9.0]})\n >>> scaled_df, plot = f_335(df1, df2)\n >>> scaled_df\n id feature1 feature2 feature4 feature5\n 0 1 -1.224745 -1.224745 4.5 5.6\n 1 2 0.000000 0.000000 6.7 7.8\n 2 3 1.224745 1.224745 8.9 9.0\n >>> type(scaled_df)\n \n >>> type(plot)\n \n \"\"\"", "canonical_solution": " merged_df = pd.merge(df1, df2, on=\"id\", how=\"outer\")\n\n # Select only numeric columns from df1 (excluding 'id')\n numeric_features_df1 = df1.select_dtypes(\n include=[\"float64\", \"int64\"]\n ).columns.tolist()\n if \"id\" in numeric_features_df1:\n numeric_features_df1.remove(\"id\")\n\n # Scale only the numeric features of df1\n if not merged_df.empty and numeric_features_df1:\n scaler = StandardScaler()\n merged_df[numeric_features_df1] = scaler.fit_transform(\n merged_df[numeric_features_df1]\n )\n\n # Pair plot only for the numeric features of df1\n pair_plot = None\n if numeric_features_df1:\n pair_plot = sns.pairplot(merged_df[numeric_features_df1])\n\n return merged_df, 
pair_plot", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Standard data merging on 'id' and checking scaled values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1.2, 3.4, 5.6],\n \"feature2\": [2.3, 4.5, 6.7],\n \"feature3\": [3.4, 5.6, 7.8],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [4.5, 6.7, 8.9], \"feature5\": [5.6, 7.8, 9.0]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertEqual(\n list(scaled_df.columns),\n [\"id\", \"feature1\", \"feature2\", \"feature3\", \"feature4\", \"feature5\"],\n )\n self.assertAlmostEqual(scaled_df[\"feature1\"].mean(), 0, places=5)\n def test_case_2(self):\n # Random data merging and checking scaled values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 3, 5],\n \"feature1\": [10, 20, 30],\n \"feature2\": [5, 15, 25],\n \"feature3\": [6, 16, 26],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 5, 3], \"feature4\": [7, 17, 27], \"feature5\": [8, 18, 28]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertAlmostEqual(scaled_df[\"feature2\"].std(), 1.224745, places=5)\n def test_case_3(self):\n # Negative values and merging on 'id' and checking scaled values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [-1, -2, -3],\n \"feature2\": [-5, -6, -7],\n \"feature3\": [-8, -9, -10],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [-11, -12, -13], \"feature5\": [-14, -15, -16]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertAlmostEqual(scaled_df[\"feature3\"].max(), 1.224745, places=5)\n def test_case_4(self):\n # Zero values and checking if scaled values remain zero\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4],\n \"feature1\": [0, 0, 0, 0],\n \"feature2\": [0, 0, 0, 0],\n \"feature3\": [0, 0, 0, 0],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4], \"feature4\": [0, 0, 0, 0], \"feature5\": [0, 0, 0, 0]}\n )\n scaled_df, _ = f_335(df1, df2)\n 
self.assertAlmostEqual(scaled_df[\"feature1\"].min(), 0, places=5)\n def test_case_5(self):\n # Large values and checking scaled min values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2],\n \"feature1\": [1000, 2000],\n \"feature2\": [500, 1500],\n \"feature3\": [100, 200],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2], \"feature4\": [10, 20], \"feature5\": [1, 2]})\n scaled_df, _ = f_335(df1, df2)\n self.assertAlmostEqual(scaled_df[\"feature2\"].min(), -1, places=5)\n def test_case_6(self):\n # Testing the plot's attributes\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [4, 5, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [10, 11, 12], \"feature5\": [13, 14, 15]}\n )\n _, pair_plot = f_335(df1, df2)\n # Checking if the pair plot has the expected attributes\n self.assertEqual(\n len(pair_plot.axes), 3\n ) # Because we have 3 valid features in df1\n self.assertIn(\"feature1\", pair_plot.data.columns)\n self.assertIn(\"feature2\", pair_plot.data.columns)\n self.assertIn(\"feature3\", pair_plot.data.columns)\n def test_case_7(self):\n # Testing with empty dataframes\n df1 = pd.DataFrame(columns=[\"id\", \"feature1\", \"feature2\", \"feature3\"])\n df2 = pd.DataFrame(columns=[\"id\", \"feature4\", \"feature5\"])\n scaled_df, _ = f_335(df1, df2)\n self.assertTrue(scaled_df.empty)\n def test_case_8(self):\n # Testing with NaN values in the dataframes\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, None],\n \"feature2\": [4, None, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [10, 11, 12], \"feature5\": [13, 14, 15]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertTrue(scaled_df.isnull().any().any()) # Checking if NaN values exist\n def tearDown(self):\n plt.close(\"all\")", "apis": ["seaborn.pairplot", "pandas.merge", "sklearn.preprocessing.StandardScaler"], "libs": ["pandas", "sklearn", "seaborn"], "doc": 
{"description": ["Merge two dataframes on the 'id' column and then scale the numeric features.", "This function merges two dataframes via outer join on the 'id' column, and scales the merged dataframe's", "numeric features from df1 to have a mean of 0 and standard deviation of 1. It also returns a pair plot of", "the scaled features from df1."], "note": [], "params": ["df1 (pd.DataFrame): Left dataframe to merge into.", "df2 (pd.DataFrame): Right dataframe to merge from."], "returns": ["merged_df (pd.DataFrame): The partially scaled and merged dataframe.", "pair_plot (seaborn.axisgrid.PairGrid): Pair plot of the scaled dataframe."], "reqs": ["pandas", "sklearn", "seaborn"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature4': [4.5, 6.7, 8.9], 'feature5': [5.6, 7.8, 9.0]})", ">>> scaled_df, plot = f_335(df1, df2)", ">>> scaled_df", "id feature1 feature2 feature4 feature5", "0 1 -1.224745 -1.224745 4.5 5.6", "1 2 0.000000 0.000000 6.7 7.8", "2 3 1.224745 1.224745 8.9 9.0", ">>> type(scaled_df)", "", ">>> type(plot)", ""]}} -{"task_id": "f_536", "prompt": "import os\nimport random\n\ndef f_536(directory, n_files):\n \"\"\"\n Create n random text files in a specific directory, write a random string to each file, and then reset the cursor to the beginning of each file.\n\n Parameters:\n - directory (str): The directory in which to generate the files.\n - n_files (int): The number of files to generate.\n\n Returns:\n - directory (str): The directory in which the files were generated.\n\n Requirements:\n - os\n - random\n\n Example:\n >>> f_536('/path/to/directory', 5)\n '/path/to/directory'\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n os.makedirs(directory)\n\n for i in range(n_files):\n filename = os.path.join(directory, f\"file_{i+1}.txt\")\n\n with open(filename, 'w') as file:\n file.write(str(random.randint(1, 
100)))\n file.seek(0)\n\n return directory", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def setUp(self):\n random.seed(42)\n \n def tearDown(self):\n shutil.rmtree('./source', ignore_errors=True)\n shutil.rmtree('./src', ignore_errors=True)\n shutil.rmtree('./s', ignore_errors=True)\n \n def test_case_1(self):\n directory = f_536('./source', 10)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 10)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt')\n \n def test_case_2(self):\n directory = f_536('./src', 1)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 1)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt') \n \n def test_case_3(self):\n directory = f_536('./s', 100)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 100)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt') \n \n def test_case_4(self):\n directory = f_536('./s', 0)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 0)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt') \n \n def test_case_5(self):\n directory = f_536('./source', 1)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 1)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt')", "apis": ["os.path.exists", "random.randint", "os.path", "os.makedirs", "os.path.join"], "libs": ["random", "os"], "doc": {"description": ["Create n random text files in a specific directory, write a random string to each file, and then reset the cursor to the beginning of each file."], "note": [], "params": ["directory (str): The directory in which to generate the files.", "n_files (int): The number of files to generate."], "returns": ["directory (str): The directory 
in which the files were generated."], "reqs": ["os", "random"], "raises": [], "example": [">>> f_536('/path/to/directory', 5)", "'/path/to/directory'"]}} -{"task_id": "f_790", "prompt": "import numpy as np\nimport random\nfrom datetime import datetime\n\ndef f_790(rows=3, columns=2, start_date=datetime(2021, 1, 1), end_date=datetime(2021, 12, 31), seed=0):\n \"\"\"\n Generate a matrix with unique dates between a given start and end date.\n \n Functionality:\n This function generates a matrix of given dimensions (rows x columns) containing unique dates between \n a specified start date and end date.\n \n Input:\n - rows (int): The number of rows for the output matrix. Default is 3.\n - columns (int): The number of columns for the output matrix. Default is 2.\n - start_date (datetime): The start date for the range of unique dates. Default is datetime(2021, 1, 1).\n - end_date (datetime): The end date for the range of unique dates. Default is datetime(2021, 12, 31).\n \n Output to be returned:\n - ndarray: A numpy ndarray with unique dates in the shape (rows, columns).\n \n Requirements:\n - numpy\n - itertools\n - datetime\n - random\n \n Example:\n >>> matrix = f_790(2, 2, datetime(2021, 1, 1), datetime(2021, 1, 10))\n >>> print(matrix)\n [['2021-01-03T00:00:00.000000000', '2021-01-07T00:00:00.000000000'],\n ['2021-01-09T00:00:00.000000000', '2021-01-04T00:00:00.000000000']]\n \"\"\"", "canonical_solution": " # Convert start_date and end_date to numpy datetime64 objects\n if seed is not None:\n random.seed(seed)\n \n # Convert start_date and end_date to numpy datetime64 objects\n start_date_np = np.datetime64(start_date)\n end_date_np = np.datetime64(end_date)\n\n # Calculate the number of days between start_date and end_date\n total_days = int((end_date_np - start_date_np).astype('timedelta64[D]').astype(int) + 1)\n\n # Randomly select unique dates within the range without replacement using random.sample\n selected_dates = sorted(random.sample(range(total_days), 
rows * columns))\n\n # Generate the matrix with selected unique dates\n matrix = (start_date_np + np.array(selected_dates).astype('timedelta64[D]')).reshape(rows, columns)\n\n return matrix", "test": "# Unit testing\nimport unittest\nimport numpy.testing as npt\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n # Using default parameters\n matrix = f_790(seed=0)\n self.assertEqual(matrix.shape, (3, 2))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) > 0)) # Dates should be unique\n def test_case_2(self):\n # Using custom rows and columns, and a small date range\n matrix = f_790(2, 2, datetime(2021, 1, 1), datetime(2021, 1, 10), seed=42)\n self.assertEqual(matrix.shape, (2, 2))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) >= 0)) # Dates should be unique\n def test_case_3(self):\n # Using custom rows and columns, and a large date range\n matrix = f_790(4, 4, datetime(2000, 1, 1), datetime(2021, 12, 31), seed=55)\n self.assertEqual(matrix.shape, (4, 4))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) >= 0)) # Dates should be unique\n def test_case_4(self):\n # Using a date range of one day\n matrix = f_790(1, 1, datetime(2021, 1, 1), datetime(2021, 1, 1), seed=0)\n expected_date = np.array(['2021-01-01'], dtype='datetime64[us]').reshape(1, 1)\n npt.assert_array_equal(matrix, expected_date) # Only one date in the range\n def test_case_5(self):\n # Using custom rows and columns, and a date range with only two days\n matrix = f_790(1, 2, datetime(2021, 1, 1), datetime(2021, 1, 2), seed=41)\n self.assertEqual(matrix.shape, (1, 2))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) >= 0)) # Dates should be unique\n expected_dates = np.array(['2021-01-01', '2021-01-02'], dtype='datetime64[us]').reshape(1, 2)\n for date in expected_dates.ravel():\n self.assertIn(date, matrix.ravel())", "apis": ["random.seed", "random.sample", "numpy.array", "datetime.datetime", "numpy.datetime64"], "libs": ["numpy", 
"random", "datetime"], "doc": {"description": ["Generate a matrix with unique dates between a given start and end date.", "Functionality:", "This function generates a matrix of given dimensions (rows x columns) containing unique dates between", "a specified start date and end date.", "Input:", "- rows (int): The number of rows for the output matrix. Default is 3.", "- columns (int): The number of columns for the output matrix. Default is 2.", "- start_date (datetime): The start date for the range of unique dates. Default is datetime(2021, 1, 1).", "- end_date (datetime): The end date for the range of unique dates. Default is datetime(2021, 12, 31).", "Output to be returned:", "- ndarray: A numpy ndarray with unique dates in the shape (rows, columns)."], "note": [], "params": [], "returns": [], "reqs": ["numpy", "itertools", "datetime", "random"], "raises": [], "example": [">>> matrix = f_790(2, 2, datetime(2021, 1, 1), datetime(2021, 1, 10))", ">>> print(matrix)", "[['2021-01-03T00:00:00.000000000', '2021-01-07T00:00:00.000000000'],", "['2021-01-09T00:00:00.000000000', '2021-01-04T00:00:00.000000000']]"]}} -{"task_id": "f_608", "prompt": "import json\nimport csv\nimport os\nimport base64\n\ndef f_608(raw_string, filename, output_dir):\n \"\"\"\n Processes a base64-encoded JSON string, stores the data in a CSV file, and returns the path of the file.\n\n Parameters:\n - raw_string (str): The base64 encoded JSON string.\n - filename (str): The name of the file to which the data should be saved (without extension).\n - output_dir (str): The path of the directory in which the file should be saved.\n\n Returns:\n - file_path (str): The path of the file.\n\n Requirements:\n - json\n - csv\n - os\n - base64\n\n Example:\n >>> f_608('eyJrZXkiOiAiVmFsdWUifQ==', 'data', './output')\n './output/data.csv'\n \"\"\"", "canonical_solution": " # Decode the string and load the data\n decoded_string = base64.b64decode(raw_string).decode('utf-8')\n data = 
json.loads(decoded_string)\n\n # Prepare the output directory\n os.makedirs(output_dir, exist_ok=True)\n\n # Prepare the file path\n file_path = os.path.join(output_dir, f'{filename}.csv')\n\n # Save the data to the file\n with open(file_path, 'w', newline='') as f:\n writer = csv.writer(f)\n for key, value in data.items():\n writer.writerow([key, value])\n\n return file_path", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n if os.path.exists('./output'):\n shutil.rmtree('./output')\n \n def test_case_1(self):\n raw_string = 'eyJrZXkiOiAiVmFsdWUifQ=='\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,Value\\n')\n os.remove(expected)\n \n def test_case_2(self):\n string_before = \"\"\"{\"key\": \"hello\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\n')\n os.remove(expected)\n def test_case_3(self):\n string_before = \"\"\"{\"key\": \"hello\", \"key2\": \"world\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\nkey2,world\\n')\n os.remove(expected)\n def test_case_4(self):\n string_before = \"\"\"{\"key\": \"hello\", \"key2\": \"world\", \"key3\": \"!\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n 
self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\nkey2,world\\nkey3,!\\n')\n os.remove(expected)\n def test_case_5(self):\n string_before = \"\"\"{\"key\": \"hello\", \"key2\": \"world\", \"key3\": \"!\", \"key4\": \"test\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\nkey2,world\\nkey3,!\\nkey4,test\\n')\n os.remove(expected)", "apis": ["os.path", "json.loads", "csv.writer", "os.makedirs", "base64.b64decode", "os.path.join"], "libs": ["csv", "base64", "json", "os"], "doc": {"description": ["Processes a base64-encoded JSON string, stores the data in a CSV file, and returns the path of the file."], "note": [], "params": ["raw_string (str): The base64 encoded JSON string.", "filename (str): The name of the file to which the data should be saved (without extension).", "output_dir (str): The path of the directory in which the file should be saved."], "returns": ["file_path (str): The path of the file."], "reqs": ["json", "csv", "os", "base64"], "raises": [], "example": [">>> f_608('eyJrZXkiOiAiVmFsdWUifQ==', 'data', './output')", "'./output/data.csv'"]}} -{"task_id": "f_804", "prompt": "import os\nimport glob\nfrom collections import Counter\n\n\ndef f_804(directory, extensions=[\".txt\", \".docx\", \".xlsx\", \".csv\"], keep_zero=True):\n \"\"\"\n Traverses a given directory recursively to count files by specified extensions.\n\n Parameters:\n - directory (str): The path of the directory to search.\n - extensions (list of str): File extensions to count. Defaults to ['.txt', '.docx', '.xlsx', '.csv'].\n - keep_zero (bool): Whether to include extensions with zero counts. 
Defaults to True.\n\n Returns:\n - Counter: An object containing counts of files for each of the specified extensions.\n\n Raises:\n - OSError: If the specified directory does not exist.\n\n Requirements:\n - os\n - glob\n - collections\n\n Note:\n - This function counts files in a case-sensitive manner.\n\n Examples:\n >>> f_804('/path/to/documents')\n Counter({'.txt': 5, '.docx': 2, '.xlsx': 1, '.csv': 0})\n >>> f_804('/path/to/documents', keep_zero=False)\n Counter({'.txt': 5, '.docx': 2, '.xlsx': 1})\n >>> f_804('/path/to/documents', extensions=['.txt'], keep_zero=False)\n Counter({'.txt': 5})\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n raise OSError(\"directory must exist.\")\n\n counter = Counter()\n\n for suffix in extensions:\n count = len(\n glob.glob(os.path.join(directory, \"**\", \"*\" + suffix), recursive=True)\n )\n if count:\n counter[suffix] += count\n else:\n if keep_zero:\n counter[suffix] += count\n return counter", "test": "import unittest\nfrom collections import Counter\nfrom tempfile import TemporaryDirectory\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = TemporaryDirectory()\n def tearDown(self):\n self.temp_dir.cleanup()\n def create_test_files(self, directory, file_list):\n for file_name in file_list:\n with open(os.path.join(directory, file_name), \"w\") as f:\n f.write(\"Test\")\n def test_case_1(self):\n # Test basic case with default extensions\n file_names = [\"file1.txt\", \"file2.docx\", \"file3.xlsx\", \"file4.csv\", \"file5.txt\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 2, \".docx\": 1, \".xlsx\": 1, \".csv\": 1})\n self.assertEqual(result, expected)\n def test_case_2(self):\n # Test empty directory\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 0, \".docx\": 0, \".xlsx\": 0, \".csv\": 0})\n self.assertEqual(result, expected)\n def test_case_3(self):\n # Test 
error handling - non-existent directory\n with self.assertRaises(OSError):\n f_804(\"/path/to/nonexistent/directory\")\n def test_case_4(self):\n # Test ignoring unspecified extensions\n file_names = [\"file1.pdf\", \"file2.png\", \"file3.txt\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 1, \".docx\": 0, \".xlsx\": 0, \".csv\": 0})\n self.assertEqual(result, expected)\n def test_case_5(self):\n # Test nested folders\n nested_dir_path = os.path.join(self.temp_dir.name, \"nested\")\n os.makedirs(nested_dir_path)\n file_names = [\"nested_file1.txt\", \"nested_file2.xlsx\"]\n self.create_test_files(nested_dir_path, file_names)\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 1, \".xlsx\": 1, \".docx\": 0, \".csv\": 0})\n self.assertEqual(result, expected)\n def test_case_6(self):\n # Test custom extensions\n file_names = [\"image.jpeg\", \"video.mp4\", \"document.pdf\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(\n self.temp_dir.name, extensions=[\".jpeg\", \".mp4\"], keep_zero=False\n )\n expected = Counter({\".jpeg\": 1, \".mp4\": 1})\n self.assertEqual(result, expected)\n def test_case_7(self):\n # Test custom extensions\n file_names = [\"file1.txt\", \"file2.docx\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name, keep_zero=False)\n expected = Counter(\n {\".txt\": 1, \".docx\": 1}\n ) # .xlsx and .csv are omitted because their count is 0 and keep_zero is False\n self.assertEqual(result, expected)\n def test_case_8(self):\n # Test case sensitivity\n file_names = [\"file1.txt\", \"file1.tXt\", \"fiLE.txt\", \"fiLE.TXt\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name, extensions=[\".txt\"])\n expected = Counter({\".txt\": 2})\n self.assertEqual(result, expected)", "apis": ["os.path.exists", "collections.Counter", "os.path", "os.path.join", 
"glob.glob"], "libs": ["collections", "glob", "os"], "doc": {"description": ["Traverses a given directory recursively to count files by specified extensions."], "note": ["This function counts files in a case-sensitive manner."], "params": ["directory (str): The path of the directory to search.", "extensions (list of str): File extensions to count. Defaults to ['.txt', '.docx', '.xlsx', '.csv'].", "keep_zero (bool): Whether to include extensions with zero counts. Defaults to True."], "returns": ["Counter: An object containing counts of files for each of the specified extensions."], "reqs": ["os", "glob", "collections"], "raises": ["OSError: If the specified directory does not exist."], "example": ["Examples:", ">>> f_804('/path/to/documents')", "Counter({'.txt': 5, '.docx': 2, '.xlsx': 1, '.csv': 0})", ">>> f_804('/path/to/documents', keep_zero=False)", "Counter({'.txt': 5, '.docx': 2, '.xlsx': 1})", ">>> f_804('/path/to/documents', extensions=['.txt'], keep_zero=False)", "Counter({'.txt': 5})"]}} -{"task_id": "f_560", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.cluster import KMeans\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_560(df):\n \"\"\"\n Given a pandas DataFrame with random numeric values, run KMeans clusters on the data and return the labels.\n\n Parameters:\n - df (DataFrame): The DataFrame to use.\n\n Returns:\n - labels (np.array): The labels from the KMeans clustering.\n\n Requirements:\n - pandas\n - numpy\n - sklearn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.rand(500, 2) * 100, columns=['A', 'B']) \n >>> labels = f_560(df)\n >>> print(labels)\n [0 2 1 0 2 0 2 1 0 1 1 1 0 0 1 1 0 2 1 2 0 0 0 0 1 2 2 2 1 1 1 2 0 0 0 1 0\n 2 1 1 2 1 1 2 2 0 2 2 1 1 0 0 2 0 1 1 2 2 1 2 2 1 1 2 0 1 1 2 2 0 2 1 1 2\n 1 2 0 2 2 0 0 2 0 1 0 1 1 1 2 2 1 2 0 2 1 0 2 1 2 2 1 0 1 0 1 2 1 1 0 2 2\n 1 1 2 2 2 2 0 1 1 2 2 0 0 2 1 2 0 2 1 2 0 2 2 1 2 2 2 2 2 2 1 1 0 0 1 2 0\n 1 1 0 2 2 1 2 1 0 2 1 1 2 1 2 2 1 0 1 1 
2 1 1 1 0 1 0 0 1 0 0 2 0 0 2 2 1\n 1 0 1 1 2 0 2 2 1 2 2 0 0 2 2 0 0 0 1 1 0 2 2 1 2 2 0 0 0 1 0 1 0 0 1 0 1\n 2 2 1 2 0 0 0 1 0 2 2 0 0 0 0 0 0 2 2 0 2 1 2 0 1 1 1 2 2 0 1 2 2 2 2 1 0\n 2 1 2 2 1 0 2 2 2 2 1 2 0 1 0 0 0 2 2 1 2 1 1 0 1 2 0 0 2 0 1 0 1 1 1 1 0\n 1 2 1 1 1 1 0 1 0 0 1 2 1 2 1 1 1 0 1 2 2 0 1 1 1 1 0 2 2 0 2 1 1 2 0 1 1\n 1 1 0 0 0 1 2 2 0 2 1 1 1 1 0 0 0 1 1 0 0 0 2 1 0 2 0 2 0 2 0 1 0 2 0 0 1\n 1 2 0 0 2 0 1 0 2 2 1 0 0 2 0 0 1 1 0 2 2 1 0 1 0 0 2 0 2 2 1 2 0 2 1 2 0\n 2 1 1 1 1 0 1 2 1 1 1 2 2 0 0 1 0 2 0 0 1 0 1 2 1 0 1 2 1 2 1 2 1 0 1 1 1\n 1 2 2 1 0 1 1 0 0 2 1 1 2 1 0 1 2 2 1 0 1 0 2 1 0 0 0 2 1 0 2 2 0 1 1 0 0\n 1 1 2 2 2 1 1 1 2 0 1 2 2 0 2 0 1 2 2]\n \"\"\"", "canonical_solution": " # Perform clustering\n scaler = StandardScaler()\n df_std = scaler.fit_transform(df.values)\n \n # Convert standardized values back to a DataFrame using pd\n df_std = pd.DataFrame(df_std, columns=df.columns)\n \n # Perform clustering with sklearn's KMeans\n kmeans = KMeans(n_clusters=3, random_state=0).fit(df_std)\n labels = kmeans.labels_ # The labels are directly a numpy array\n \n return labels", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.rand(500, 2) * 100, columns=['A', 'B'])\n labels = f_560(df)\n self.assertEqual(len(labels), 500)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_2(self):\n df = pd.DataFrame(np.random.rand(10, 2) * 100, columns=['A', 'B'])\n labels = f_560(df)\n self.assertEqual(len(labels), 10)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_3(self):\n df = pd.DataFrame(np.random.rand(5, 4) * 100, columns=['A', 'B', 'C', 'D'])\n labels = f_560(df)\n self.assertEqual(len(labels), 5)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_4(self):\n df = pd.DataFrame(np.random.rand(20, 3) * 100, columns=['A', 'B', 'C'])\n labels = f_560(df)\n self.assertEqual(len(labels), 20)\n 
self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_5(self):\n df = pd.DataFrame(np.random.rand(42, 1) * 100, columns=['A'])\n labels = f_560(df)\n self.assertEqual(len(labels), 42)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))", "apis": ["sklearn.cluster.KMeans", "pandas.DataFrame", "sklearn.preprocessing.StandardScaler"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Given a pandas DataFrame with random numeric values, run KMeans clusters on the data and return the labels."], "note": [], "params": ["df (DataFrame): The DataFrame to use."], "returns": ["labels (np.array): The labels from the KMeans clustering."], "reqs": ["pandas", "numpy", "sklearn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.rand(500, 2) * 100, columns=['A', 'B'])", ">>> labels = f_560(df)", ">>> print(labels)", "[0 2 1 0 2 0 2 1 0 1 1 1 0 0 1 1 0 2 1 2 0 0 0 0 1 2 2 2 1 1 1 2 0 0 0 1 0", "2 1 1 2 1 1 2 2 0 2 2 1 1 0 0 2 0 1 1 2 2 1 2 2 1 1 2 0 1 1 2 2 0 2 1 1 2", "1 2 0 2 2 0 0 2 0 1 0 1 1 1 2 2 1 2 0 2 1 0 2 1 2 2 1 0 1 0 1 2 1 1 0 2 2", "1 1 2 2 2 2 0 1 1 2 2 0 0 2 1 2 0 2 1 2 0 2 2 1 2 2 2 2 2 2 1 1 0 0 1 2 0", "1 1 0 2 2 1 2 1 0 2 1 1 2 1 2 2 1 0 1 1 2 1 1 1 0 1 0 0 1 0 0 2 0 0 2 2 1", "1 0 1 1 2 0 2 2 1 2 2 0 0 2 2 0 0 0 1 1 0 2 2 1 2 2 0 0 0 1 0 1 0 0 1 0 1", "2 2 1 2 0 0 0 1 0 2 2 0 0 0 0 0 0 2 2 0 2 1 2 0 1 1 1 2 2 0 1 2 2 2 2 1 0", "2 1 2 2 1 0 2 2 2 2 1 2 0 1 0 0 0 2 2 1 2 1 1 0 1 2 0 0 2 0 1 0 1 1 1 1 0", "1 2 1 1 1 1 0 1 0 0 1 2 1 2 1 1 1 0 1 2 2 0 1 1 1 1 0 2 2 0 2 1 1 2 0 1 1", "1 1 0 0 0 1 2 2 0 2 1 1 1 1 0 0 0 1 1 0 0 0 2 1 0 2 0 2 0 2 0 1 0 2 0 0 1", "1 2 0 0 2 0 1 0 2 2 1 0 0 2 0 0 1 1 0 2 2 1 0 1 0 0 2 0 2 2 1 2 0 2 1 2 0", "2 1 1 1 1 0 1 2 1 1 1 2 2 0 0 1 0 2 0 0 1 0 1 2 1 0 1 2 1 2 1 2 1 0 1 1 1", "1 2 2 1 0 1 1 0 0 2 1 1 2 1 0 1 2 2 1 0 1 0 2 1 0 0 0 2 1 0 2 2 0 1 1 0 0", "1 1 2 2 2 1 1 1 2 0 1 2 2 0 2 0 1 2 2]"]}} -{"task_id": "f_837", "prompt": "import requests\nimport pandas as pd\nfrom bs4 
import BeautifulSoup\n\n\ndef f_837(url: str, csv_file_path: str) -> list:\n \"\"\"\n Extracts title, date, and author information from a webpage and writes the data to a CSV file.\n\n The function iterates through each 'div' element with a class 'container', extracting the text of 'h1', and 'span' elements with classes \n 'date' and 'author', respectively. Default values ('No Title', 'No Date', or 'No Author') are used if an element is \n not found. The extracted data is stored in a list of tuples.\n\n The list of tuples is then converted into a Pandas DataFrame and saved to a CSV file at the specified file path. \n The DataFrame's columns are labeled as 'Title', 'Date', and 'Author'. The function returns the list of tuples.\n\n Raises:\n - RuntimeError: If the URL is incorrect or the server is down, the error message might be \"Error fetching URL: HTTP Error 404: Not Found\" \n or \"Error fetching URL: ConnectionError\". The function begins by making an HTTP request to the specified URL. It sets a timeout of 5 seconds to avoid \n prolonged waiting in case of unresponsive webpages. If the request encounters any exceptions such as connection errors, timeouts, or HTTP errors, a 'requests.RequestException' is raised. \n The function raises a '' with a message that includes the details of the exception. For example,, depending on the specific issue encountered.\n Parameters:\n\n Parameters:\n - url (str): The URL of the webpage to be parsed.\n - csv_file_path (str): The path where the resulting CSV file will be saved.\n\n Returns:\n list: A list of tuples containing the (title, date, author) extracted from the webpage. 
Default placeholders \n are used for missing information.\n\n Requirements:\n - requests\n - bs4\n - pandas\n\n Example:\n >>> data = f_837('https://example.com/articles', '/path/to/save/csv/file.csv')\n >>> type(data)\n \n >>> len(data) > 0\n True\n \"\"\"", "canonical_solution": "\n\n try:\n response = requests.get(url, timeout=5)\n response.raise_for_status()\n except requests.RequestException as e:\n raise RuntimeError(f\"Error fetching URL: {e}\")\n\n soup = BeautifulSoup(response.text, \"html.parser\")\n data = []\n for div in soup.find_all(\"div\", class_=\"container\"):\n title = div.find(\"h1\").text.strip() if div.find(\"h1\") else \"No Title\"\n date = (\n div.find(\"span\", class_=\"date\").text.strip()\n if div.find(\"span\", class_=\"date\")\n else \"No Date\"\n )\n author = (\n div.find(\"span\", class_=\"author\").text.strip()\n if div.find(\"span\", class_=\"author\")\n else \"No Author\"\n )\n data.append((title, date, author))\n\n df = pd.DataFrame(data, columns=[\"Title\", \"Date\", \"Author\"])\n df.to_csv(csv_file_path, index=False)\n\n return data", "test": "import unittest\nfrom unittest.mock import patch\nimport os\nimport shutil\n# Mock HTML content\ntest_data_1_html = \"\"\"\n\n
\n

Title1

\n Date1\n Author1\n
\n
\n

Title2

\n Date2\n Author2\n
\n\n\"\"\"\ntest_data_2_html = \"\"\"\n\n
\n

TitleA

\n DateA\n AuthorA\n
\n\n\"\"\"\nclass MockResponse:\n \"\"\"Mock class for requests.Response\"\"\"\n def __init__(self, text, status_code):\n self.text = text\n self.status_code = status_code\n def raise_for_status(self):\n if self.status_code != 200:\n raise Exception(\"HTTP Error\")\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the f_837 function\"\"\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up any necessary resources before any tests are run.\"\"\"\n os.makedirs(\"mnt/data\", exist_ok=True) # Create the directory for test files\n @patch(\"requests.get\")\n def test_html_parsing_multiple_entries(self, mock_get):\n \"\"\"Test parsing of HTML with multiple data entries.\"\"\"\n mock_get.return_value = MockResponse(test_data_1_html, 200)\n url = \"https://example.com/test_data_1.html\"\n csv_file_path = \"mnt/data/output_1.csv\"\n expected_output = [\n (\"Title1\", \"Date1\", \"Author1\"),\n (\"Title2\", \"Date2\", \"Author2\"),\n ]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_single_entry(self, mock_get):\n \"\"\"Test parsing of HTML with a single data entry.\"\"\"\n mock_get.return_value = MockResponse(test_data_2_html, 200)\n url = \"https://example.com/test_data_2.html\"\n csv_file_path = \"mnt/data/output_2.csv\"\n expected_output = [(\"TitleA\", \"DateA\", \"AuthorA\")]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_with_same_data_as_first(self, mock_get):\n \"\"\"Test parsing of HTML similar to first test case.\"\"\"\n mock_get.return_value = MockResponse(test_data_1_html, 200)\n url = \"https://example.com/test_data_1.html\"\n csv_file_path = \"mnt/data/output_3.csv\"\n expected_output = [\n (\"Title1\", \"Date1\", \"Author1\"),\n (\"Title2\", \"Date2\", \"Author2\"),\n ]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_with_same_data_as_second(self, 
mock_get):\n \"\"\"Test parsing of HTML similar to second test case.\"\"\"\n mock_get.return_value = MockResponse(test_data_2_html, 200)\n url = \"https://example.com/test_data_2.html\"\n csv_file_path = \"mnt/data/output_4.csv\"\n expected_output = [(\"TitleA\", \"DateA\", \"AuthorA\")]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_with_nonexistent_url(self, mock_get):\n \"\"\"Test handling of HTTP error when URL does not exist.\"\"\"\n mock_get.return_value = MockResponse(\"\", 404) # Simulating a 404 error\n url = \"https://example.com/non_existent.html\" # Non-existent URL\n csv_file_path = \"mnt/data/output_5.csv\"\n with self.assertRaises(Exception):\n f_837(url, csv_file_path) # Should raise HTTP Error\n @patch(\"requests.get\")\n def test_f_837_request_exception(self, mock_get):\n \"\"\"Test f_837 raises an exception when there is a request error.\"\"\"\n mock_get.side_effect = requests.RequestException(\"Error fetching URL\")\n url = \"https://example.com/non_existent.html\"\n csv_file_path = \"mnt/data/output_error.csv\"\n with self.assertRaises(Exception) as context:\n f_837(url, csv_file_path)\n self.assertIn(\"Error fetching URL\", str(context.exception))\n @classmethod\n def tearDownClass(cls):\n \"\"\"Clean up shared resources after all tests in the class have completed.\"\"\"\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["bs4.BeautifulSoup", "pandas.DataFrame", "requests.get", "requests.RequestException"], "libs": ["bs4", "pandas", "requests"], "doc": {"description": ["Extracts title, date, and author information from a webpage and writes the data to a CSV file.", "The function iterates through each 'div' element with a class 'container', extracting the text of 'h1', and 'span' elements with classes", "'date' and 'author', respectively. 
Default values ('No Title', 'No Date', or 'No Author') are used if an element is", "not found. The extracted data is stored in a list of tuples.", "The list of tuples is then converted into a Pandas DataFrame and saved to a CSV file at the specified file path.", "The DataFrame's columns are labeled as 'Title', 'Date', and 'Author'. The function returns the list of tuples."], "note": [], "params": ["url (str): The URL of the webpage to be parsed.", "csv_file_path (str): The path where the resulting CSV file will be saved."], "returns": ["list: A list of tuples containing the (title, date, author) extracted from the webpage. Default placeholders", "are used for missing information."], "reqs": ["requests", "bs4", "pandas"], "raises": ["RuntimeError: If the URL is incorrect or the server is down, the error message might be \"Error fetching URL: HTTP Error 404: Not Found\"", "or \"Error fetching URL: ConnectionError\". The function begins by making an HTTP request to the specified URL. It sets a timeout of 5 seconds to avoid", "prolonged waiting in case of unresponsive webpages. If the request encounters any exceptions such as connection errors, timeouts, or HTTP errors, a 'requests.RequestException' is raised.", "The function raises a '' with a message that includes the details of the exception. For example,, depending on the specific issue encountered."], "example": [">>> data = f_837('https://example.com/articles', '/path/to/save/csv/file.csv')", ">>> type(data)", "", ">>> len(data) > 0", "True"]}} -{"task_id": "f_740", "prompt": "from collections import Counter\nimport random\n\nLETTERS = ['a', 'b', 'c', 'd', 'e']\n\ndef f_740(count, seed=0):\n \"\"\"\n Generate a specific number of random letter pairs, each from a predefined list, and analyze the frequency of each pair.\n\n Parameters:\n - count (int): The number of letter pairs to generate.\n - seed (int, optional): Seed for the random number generator to ensure reproducibility. 
Defaults to None.\n\n Returns:\n - Counter: A Counter object representing the frequency of each generated letter pair.\n\n Requirements:\n - collections.Counter\n - random\n\n Examples:\n >>> f_740(5, seed=42)\n Counter({('d', 'a'): 1, ('b', 'b'): 1, ('d', 'd'): 1, ('e', 'a'): 1, ('c', 'a'): 1})\n >>> f_740(0, seed=42)\n Counter()\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n pairs = [tuple(random.choices(LETTERS, k=2)) for _ in range(count)]\n pair_frequency = Counter(pairs)\n\n return pair_frequency", "test": "import unittest\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Initialize random seed for reproducibility in tests\n random.seed(42)\n def test_case_1(self):\n # Test with count = 5\n result = f_740(5, seed=42)\n self.assertIsInstance(result, Counter)\n self.assertEqual(result, Counter({('d', 'a'): 1, ('b', 'b'): 1, ('d', 'd'): 1, ('e', 'a'): 1, ('c', 'a'): 1}))\n def test_case_2(self):\n # Test with count = 0 (no pairs)\n result = f_740(0, seed=4)\n self.assertEqual(result, Counter())\n def test_case_3(self):\n # Test with count = 100 (larger number)\n result = f_740(100, seed=2)\n self.assertEqual(sum(result.values()), 100)\n def test_case_4(self):\n # Test with count = 10 and check if all pairs have letters from the defined LETTERS\n result = f_740(10, seed=0)\n self.assertEqual(result, Counter({('c', 'c'): 2, ('d', 'b'): 2, ('e', 'e'): 2, ('e', 'd'): 1, ('c', 'b'): 1, ('e', 'c'): 1, ('b', 'd'): 1}))\n def test_case_5(self):\n # Test with count = 5 and check if the total counts match the input count\n result = f_740(5, seed=1)\n self.assertEqual(result, Counter({('a', 'e'): 1, ('d', 'b'): 1, ('c', 'c'): 1, ('d', 'd'): 1, ('a', 'a'): 1}))", "apis": ["random.choices", "random.seed", "collections.Counter"], "libs": ["collections", "random"], "doc": {"description": ["Generate a specific number of random letter pairs, each from a predefined list, and analyze the frequency of each pair."], "note": [], 
"params": ["count (int): The number of letter pairs to generate.", "seed (int, optional): Seed for the random number generator to ensure reproducibility. Defaults to None."], "returns": ["Counter: A Counter object representing the frequency of each generated letter pair."], "reqs": ["collections.Counter", "random"], "raises": [], "example": ["Examples:", ">>> f_740(5, seed=42)", "Counter({('d', 'a'): 1, ('b', 'b'): 1, ('d', 'd'): 1, ('e', 'a'): 1, ('c', 'a'): 1})", ">>> f_740(0, seed=42)", "Counter()"]}} -{"task_id": "f_869", "prompt": "import numpy as np\nfrom scipy.stats import ttest_ind\nimport matplotlib.pyplot as plt\n\n\ndef f_869(kwargs):\n \"\"\"\n Performs a two-sample t-test on numerical data from two groups to determine if there is a significant\n difference in their means. The function handles NaN values, computes descriptive statistics for each group,\n and generates a boxplot and histograms for data visualization.\n\n Parameters:\n - kwargs (dict): A dictionary with two keys, 'group1' and 'group2'. Each key maps to a list of numbers.\n Lists can contain NaN values, which will be excluded from analysis.\n\n Returns:\n - dict: A dictionary containing:\n - 'significant': Boolean. 
True if the means of the two groups are significantly different (p < 0.05).\n - 'group1_stats': Dictionary with mean and standard deviation of 'group1' (excluding NaNs).\n - 'group2_stats': Dictionary with mean and standard deviation of 'group2' (excluding NaNs).\n - 'ax_boxplot': A matplotlib Axes object with a boxplot comparing 'group1' and 'group2'.\n - 'ax_histogram': A matplotlib Axes object with histograms of 'group1' and 'group2'.\n\n Raises:\n - ValueError: If either group is empty, contains only NaN values, has less than two non-NaN values,\n or if the variance in one or both groups is below a threshold (1e-8).\n\n Requirements:\n - numpy\n - scipy\n - matplotlib\n\n Note:\n - The function sets the significance level (alpha) at 0.05.\n - It removes NaN values before performing any calculations or plotting.\n - A t-test is performed with the 'nan_policy' set to 'omit' to ignore NaNs.\n - The function checks for sufficient non-NaN data points and adequate variance in each group before conducting the t-test.\n - The boxplot and histograms provide a visual comparison of the data distributions.\n \n Example:\n >>> data = {'group1': [1, 2, 3, 4], 'group2': [5, 6, 7, 8]}\n >>> results = f_869(data)\n >>> results['significant']\n True\n \"\"\"", "canonical_solution": " alpha = 0.05 # Define the significance level\n\n group1 = np.array(kwargs.get(\"group1\", []))\n group2 = np.array(kwargs.get(\"group2\", []))\n\n # Check for empty or all-NaN groups\n if (\n len(group1) == 0\n or len(group2) == 0\n or np.all(np.isnan(group1))\n or np.all(np.isnan(group2))\n ):\n raise ValueError(\"One or both groups are empty or contain only NaN values.\")\n\n # Removing NaN values and ensuring sufficient data\n valid_group1 = group1[~np.isnan(group1)]\n valid_group2 = group2[~np.isnan(group2)]\n\n # Check for sufficient size and variance\n if len(valid_group1) < 2 or len(valid_group2) < 2:\n raise ValueError(\"Each group must have at least two non-NaN values.\")\n\n if 
np.var(valid_group1) < 1e-8 or np.var(valid_group2) < 1e-8:\n raise ValueError(\"Variance in one or both groups is too low.\")\n\n # Perform t-test\n _, p_val = ttest_ind(valid_group1, valid_group2, nan_policy=\"omit\")\n\n significant = p_val < alpha\n\n # Calculate descriptive statistics\n group1_stats = {\"mean\": np.mean(valid_group1), \"std\": np.std(valid_group1)}\n group2_stats = {\"mean\": np.mean(valid_group2), \"std\": np.std(valid_group2)}\n\n # Plotting\n _, (ax_boxplot, ax_histogram) = plt.subplots(2, 1, figsize=(8, 12))\n\n # Boxplot\n ax_boxplot.boxplot([valid_group1, valid_group2], labels=[\"group1\", \"group2\"])\n\n # Histogram\n ax_histogram.hist(valid_group1, alpha=0.5, label=\"group1\")\n ax_histogram.hist(valid_group2, alpha=0.5, label=\"group2\")\n ax_histogram.legend()\n\n return {\n \"significant\": significant,\n \"group1_stats\": group1_stats,\n \"group2_stats\": group2_stats,\n \"ax_boxplot\": ax_boxplot,\n \"ax_histogram\": ax_histogram,\n }", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n def test_different_means(self):\n \"\"\"Test with groups having significantly different means.\"\"\"\n data = {\"group1\": [1, 2, 3], \"group2\": [4, 5, 6]}\n result = f_869(data)\n self.assertTrue(result[\"significant\"])\n def test_similar_means(self):\n \"\"\"Test with groups having similar means.\"\"\"\n data = {\"group1\": [1, 2, 3], \"group2\": [1, 2, 3]}\n result = f_869(data)\n self.assertFalse(result[\"significant\"])\n def test_with_nan_values(self):\n \"\"\"Test with groups containing NaN values but with at least two non-NaN values in each group.\"\"\"\n data = {\"group1\": [np.nan, 2, 3], \"group2\": [1, np.nan, 3]}\n result = f_869(data)\n self.assertIsNotNone(result)\n def test_empty_group(self):\n \"\"\"Test with one of the groups being empty.\"\"\"\n data = {\"group1\": [], \"group2\": [1, 2, 3]}\n with self.assertRaises(ValueError):\n f_869(data)\n def 
test_all_nan_values(self):\n \"\"\"Test with groups containing only NaN values.\"\"\"\n data = {\"group1\": [np.nan, np.nan], \"group2\": [np.nan, np.nan]}\n with self.assertRaises(ValueError):\n f_869(data)\n def test_insufficient_group_size(self):\n \"\"\"Test with one of the groups having less than two non-NaN values.\"\"\"\n data = {\"group1\": [1, np.nan], \"group2\": [2, 3, 4]}\n with self.assertRaises(ValueError):\n f_869(data)\n def test_low_variance(self):\n \"\"\"Test with one of the groups having extremely low variance.\"\"\"\n data = {\"group1\": [1.00000001, 1.00000002], \"group2\": [2, 3, 4]}\n with self.assertRaises(ValueError):\n f_869(data)", "apis": ["numpy.mean", "numpy.std", "numpy.isnan", "numpy.all", "numpy.var", "numpy.array", "matplotlib.pyplot.subplots", "scipy.stats.ttest_ind"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Performs a two-sample t-test on numerical data from two groups to determine if there is a significant", "difference in their means. The function handles NaN values, computes descriptive statistics for each group,", "and generates a boxplot and histograms for data visualization."], "note": ["The function sets the significance level (alpha) at 0.05.", "It removes NaN values before performing any calculations or plotting.", "A t-test is performed with the 'nan_policy' set to 'omit' to ignore NaNs.", "The function checks for sufficient non-NaN data points and adequate variance in each group before conducting the t-test.", "The boxplot and histograms provide a visual comparison of the data distributions."], "params": ["kwargs (dict): A dictionary with two keys, 'group1' and 'group2'. Each key maps to a list of numbers.", "Lists can contain NaN values, which will be excluded from analysis."], "returns": ["dict: A dictionary containing:", "'significant': Boolean. 
True if the means of the two groups are significantly different (p < 0.05).", "'group1_stats': Dictionary with mean and standard deviation of 'group1' (excluding NaNs).", "'group2_stats': Dictionary with mean and standard deviation of 'group2' (excluding NaNs).", "'ax_boxplot': A matplotlib Axes object with a boxplot comparing 'group1' and 'group2'.", "'ax_histogram': A matplotlib Axes object with histograms of 'group1' and 'group2'."], "reqs": ["numpy", "scipy", "matplotlib"], "raises": ["ValueError: If either group is empty, contains only NaN values, has less than two non-NaN values,", "or if the variance in one or both groups is below a threshold (1e-8)."], "example": [">>> data = {'group1': [1, 2, 3, 4], 'group2': [5, 6, 7, 8]}", ">>> results = f_869(data)", ">>> results['significant']", "True"]}} -{"task_id": "f_580", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LinearRegression\n\ndef f_580(df):\n \"\"\"\n Use a linear regression model to predict the \"value\" of \"feature\" in the given dataframe and return the coefficients and intercept.\n\n Parameters:\n - df (pd.DataFrame): pandas DataFrame that contains columns named 'feature' and 'value'.\n\n Returns:\n - result (dict): A dictionary with the coefficients and the intercept of the fitted linear regression model.\n\n Requirements:\n - pandas\n - numpy\n - sklearn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame({'feature': np.random.rand(100), 'value': np.random.rand(100)})\n >>> coefficients = f_580(df)\n >>> print(coefficients)\n {'coefficients': [[-0.03353164387961974]], 'intercept': [0.5135976564010359]}\n \"\"\"", "canonical_solution": " X = np.array(df['feature']).reshape(-1,1) # Explicitly converting to numpy array and reshaping\n y = np.array(df['value']).reshape(-1,1) # Explicitly converting to numpy array and reshaping\n\n model = LinearRegression().fit(X, y)\n\n return {'coefficients': model.coef_.tolist(), 'intercept': 
model.intercept_.tolist()}", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'feature': np.random.rand(100), 'value': np.random.rand(100)})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n def test_case_2(self):\n df = pd.DataFrame({'feature': [1, 2, 3, 4, 5], 'value': [1, 2, 3, 4, 5]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n self.assertAlmostEqual(coefficients['coefficients'][0][0], 1.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 0.0)\n def test_case_3(self):\n df = pd.DataFrame({'feature': [1, 2, 3, 4, 5], 'value': [2, 4, 6, 8, 10]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n self.assertAlmostEqual(coefficients['coefficients'][0][0], 2.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 0.0)\n def test_case_4(self):\n df = pd.DataFrame({'feature': [0, 0, 0, 0, 0], 'value': [1, 2, 3, 4, 5]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n self.assertAlmostEqual(coefficients['coefficients'][0][0], 0.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 3.0)\n def test_case_5(self):\n df = pd.DataFrame({'feature': [1, 2, 3, 4, 5], 'value': [0, 0, 0, 0, 0]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n 
self.assertAlmostEqual(coefficients['coefficients'][0][0], 0.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 0.0)", "apis": ["numpy.array", "sklearn.linear_model.LinearRegression"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Use a linear regression model to predict the \"value\" of \"feature\" in the given dataframe and return the coefficients and intercept."], "note": [], "params": ["df (pd.DataFrame): pandas DataFrame that contains columns named 'feature' and 'value'."], "returns": ["result (dict): A dictionary with the coefficients and the intercept of the fitted linear regression model."], "reqs": ["pandas", "numpy", "sklearn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame({'feature': np.random.rand(100), 'value': np.random.rand(100)})", ">>> coefficients = f_580(df)", ">>> print(coefficients)", "{'coefficients': [[-0.03353164387961974]], 'intercept': [0.5135976564010359]}"]}} -{"task_id": "f_528", "prompt": "import heapq\nimport collections\n\ndef f_528(x, n):\n \"\"\"\n Find the n most common letters in a dictionary, x, where the key letters and the values are their frequencies.\n\n Parameters:\n - x (dict): The dictionary of letter frequencies.\n - n (int): The number of most frequent letters to return.\n\n Returns:\n - list: The n most frequent letters.\n\n Requirements:\n - heapq\n - collections\n\n Example:\n >>> f_528({'a': 1, 'b': 2, 'c': 3}, 2)\n ['c', 'b']\n \"\"\"", "canonical_solution": " counter = collections.Counter(x)\n most_frequent = heapq.nlargest(n, counter.keys(), key=counter.get)\n\n return most_frequent", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 2), ['c', 'b'])\n def test_case_2(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 1), ['c'])\n def test_case_3(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 3), ['c', 'b', 'a'])\n def test_case_4(self):\n 
self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 0), [])\n def test_case_5(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 4), ['c', 'b', 'a'])", "apis": ["collections.Counter", "heapq.nlargest"], "libs": ["collections", "heapq"], "doc": {"description": ["Find the n most common letters in a dictionary, x, where the key letters and the values are their frequencies."], "note": [], "params": ["x (dict): The dictionary of letter frequencies.", "n (int): The number of most frequent letters to return."], "returns": ["list: The n most frequent letters."], "reqs": ["heapq", "collections"], "raises": [], "example": [">>> f_528({'a': 1, 'b': 2, 'c': 3}, 2)", "['c', 'b']"]}} -{"task_id": "f_413", "prompt": "import json\nimport numpy as np\nfrom collections import defaultdict\nimport matplotlib.pyplot as plt\n\n\ndef f_413(input_file):\n \"\"\"\n Reads a JSON file containing a list of dictionaries. For each key across all dictionaries,\n calculates the mean and median of its values using numpy. Visualizes the mean and median\n using bar charts. 
Returns the results and plots.\n\n Parameters:\n - input_file (str): Path to the input JSON file containing a list of dictionaries.\n\n Returns:\n - result (dict): each key corresponds to those in the input dictionaries, and the corresponding\n value is another dict with keys 'mean' and 'median', representing the calculated statistics.\n - plots (list[matplotlib.axes._subplots.AxesSubplot]): A list of bar charts, one for\n each key in the dictionaries, visualizing the mean and median values.\n\n Requirements:\n - json\n - numpy\n - collections.defaultdict\n - matplotlib.pyplot\n\n Example:\n >>> results, plots = f_413(\"sample_data.json\")\n >>> type(plots[0])\n \n >>> results\n {'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 6.0, 'median': 6.0}}\n \"\"\"", "canonical_solution": " with open(input_file, \"r\") as f:\n data = json.load(f)\n\n stats = defaultdict(list)\n for d in data:\n for key, value in d.items():\n stats[key].append(value)\n\n result = {k: {\"mean\": np.mean(v), \"median\": np.median(v)} for k, v in stats.items()}\n\n plots = []\n for key, values in result.items():\n _, ax = plt.subplots()\n ax.bar([\"mean\", \"median\"], [values[\"mean\"], values[\"median\"]])\n ax.set_title(f\"Statistics of {key}\")\n plots.append(ax)\n return result, plots", "test": "import matplotlib\nimport unittest\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.temp_dir = tempfile.TemporaryDirectory()\n cls.test_data = {\n \"test_1.json\": [{\"a\": 2, \"b\": 4}, {\"a\": 4, \"b\": 8}],\n \"test_2.json\": [{\"x\": 1}, {\"y\": 2}, {\"z\": 6}],\n \"invalid.json\": {\"not\": \"valid\"},\n \"empty.json\": [],\n }\n # Generate test files\n for filename, content in cls.test_data.items():\n with open(os.path.join(cls.temp_dir.name, filename), \"w\") as f:\n json.dump(content, f)\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n # Check plot generation\n 
expected_titles = [\"a\", \"b\"]\n _, plots = f_413(os.path.join(self.temp_dir.name, \"test_1.json\"))\n self.assertEqual(len(plots), len(expected_titles))\n for plot, title in zip(plots, expected_titles):\n assert isinstance(plot, matplotlib.axes._axes.Axes)\n self.assertTrue(plot.get_title(), f\"Statistics of {title}\")\n def test_case_2(self):\n # Check result correctness\n results, _ = f_413(os.path.join(self.temp_dir.name, \"test_1.json\"))\n self.assertIn(\"a\", results)\n self.assertIn(\"b\", results)\n self.assertEqual(results[\"a\"][\"mean\"], 3.0)\n self.assertEqual(results[\"a\"][\"median\"], 3.0)\n self.assertEqual(results[\"b\"][\"mean\"], 6.0)\n self.assertEqual(results[\"b\"][\"median\"], 6.0)\n def test_case_3(self):\n # Test with invalid data structure (not a list of dicts)\n with self.assertRaises(AttributeError):\n f_413(os.path.join(self.temp_dir.name, \"invalid.json\"))\n def test_case_4(self):\n # Test with empty data\n results, plots = f_413(os.path.join(self.temp_dir.name, \"empty.json\"))\n self.assertEqual(results, {})\n self.assertEqual(len(plots), 0)\n def test_case_5(self):\n # Test handling nested dicts with one key each\n results, _ = f_413(os.path.join(self.temp_dir.name, \"test_2.json\"))\n self.assertIn(\"x\", results)\n self.assertIn(\"y\", results)\n self.assertIn(\"z\", results)\n self.assertEqual(results[\"x\"][\"mean\"], 1.0)\n self.assertEqual(results[\"x\"][\"median\"], 1.0)\n self.assertEqual(results[\"y\"][\"mean\"], 2.0)\n self.assertEqual(results[\"y\"][\"median\"], 2.0)\n self.assertEqual(results[\"z\"][\"mean\"], 6.0)\n self.assertEqual(results[\"z\"][\"median\"], 6.0)\n def test_case_6(self):\n # Test with nonexistent filename\n with self.assertRaises(FileNotFoundError):\n f_413(os.path.join(self.temp_dir.name, \"NOTEXISTS.json\"))", "apis": ["numpy.median", "numpy.mean", "json.load", "collections.defaultdict", "matplotlib.pyplot.subplots"], "libs": ["numpy", "collections", "json", "matplotlib"], "doc": 
{"description": ["Reads a JSON file containing a list of dictionaries. For each key across all dictionaries,", "calculates the mean and median of its values using numpy. Visualizes the mean and median", "using bar charts. Returns the results and plots."], "note": [], "params": ["input_file (str): Path to the input JSON file containing a list of dictionaries."], "returns": ["result (dict): each key corresponds to those in the input dictionaries, and the corresponding", "value is another dict with keys 'mean' and 'median', representing the calculated statistics.", "plots (list[matplotlib.axes._subplots.AxesSubplot]): A list of bar charts, one for", "each key in the dictionaries, visualizing the mean and median values."], "reqs": ["json", "numpy", "collections.defaultdict", "matplotlib.pyplot"], "raises": [], "example": [">>> results, plots = f_413(\"sample_data.json\")", ">>> type(plots[0])", "", ">>> results", "{'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 6.0, 'median': 6.0}}"]}} -{"task_id": "f_917", "prompt": "import time\nimport matplotlib.pyplot as plt\n\n\ndef f_917(time_strings, time_format=\"%d/%m/%Y %H:%M:%S.%f\"):\n \"\"\"\n Parses a list of time strings and plots a histogram of the seconds component.\n\n Parameters:\n - time_strings (list of str): A list of time strings to be parsed. Each string in the list should\n be formatted according to the 'time_format' parameter.\n - time_format (str): The format string for parsing the time strings in 'time_strings'.\n The default format is '%d/%m/%Y %H:%M:%S.%f', representing day/month/year hours:minutes:seconds.microseconds.\n\n Returns:\n - ax (matplotlib.axes._subplots.AxesSubplot or None): An AxesSubplot object with the histogram plotted if\n parsing is successful. 
Returns None if a parsing error occurs.\n\n Requirements:\n - time\n - matplotlib\n \n Raises:\n - ValueError: If any time string in 'time_strings' cannot be parsed according to 'time_format'.\n\n Example:\n >>> time_strings = ['30/03/2009 16:31:32.123', '15/04/2010 14:25:46.789', '20/12/2011 12:34:56.000']\n >>> ax = f_917(time_strings)\n >>> plt.show() # Display the plot\n \"\"\"", "canonical_solution": " try:\n seconds = [time.strptime(ts, time_format).tm_sec for ts in time_strings]\n _, ax = plt.subplots()\n ax.hist(seconds, bins=60, rwidth=0.8)\n return ax\n except ValueError as e:\n print(f\"Error parsing time strings: {e}\")\n return None", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_917.\"\"\"\n def test_histogram_counts(self):\n \"\"\"Test the counts in the histogram.\"\"\"\n time_strings = [\n \"30/03/2009 16:31:32.123\",\n \"15/04/2010 14:25:46.789\",\n \"20/12/2011 12:34:56.000\",\n ]\n ax = f_917(time_strings)\n # Extract histogram data\n n_values = [patch.get_height() for patch in ax.patches]\n # Check the count of values in each bin\n self.assertTrue(1 in n_values)\n def test_histogram_title(self):\n \"\"\"Test the title of the histogram.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings)\n self.assertEqual(ax.get_title(), \"\")\n def test_histogram_xaxis(self):\n \"\"\"Test the x-axis label of the histogram.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings)\n self.assertEqual(ax.get_xlabel(), \"\")\n def test_histogram_yaxis(self):\n \"\"\"Test the y-axis label of the histogram.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings)\n self.assertEqual(ax.get_ylabel(), \"\")\n def test_large_input(self):\n \"\"\"Test with a large input.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"] * 50\n ax = f_917(time_strings)\n # Extract histogram data\n n_values = [patch.get_height() 
for patch in ax.patches]\n # Check the count of values in the specific bin corresponding to the seconds value \"32\"\n self.assertTrue(50 in n_values)\n def test_invalid_time_format(self):\n \"\"\"Test with an invalid time format.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings, time_format=\"%d/%m/%Y %H:%M:%S\")\n self.assertIsNone(ax)\n def tearDown(self):\n plt.close()", "apis": ["time.strptime", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "time"], "doc": {"description": ["Parses a list of time strings and plots a histogram of the seconds component."], "note": [], "params": ["time_strings (list of str): A list of time strings to be parsed. Each string in the list should", "be formatted according to the 'time_format' parameter.", "time_format (str): The format string for parsing the time strings in 'time_strings'.", "The default format is '%d/%m/%Y %H:%M:%S.%f', representing day/month/year hours:minutes:seconds.microseconds."], "returns": ["ax (matplotlib.axes._subplots.AxesSubplot or None): An AxesSubplot object with the histogram plotted if", "parsing is successful. 
Returns None if a parsing error occurs."], "reqs": ["time", "matplotlib"], "raises": ["ValueError: If any time string in 'time_strings' cannot be parsed according to 'time_format'."], "example": [">>> time_strings = ['30/03/2009 16:31:32.123', '15/04/2010 14:25:46.789', '20/12/2011 12:34:56.000']", ">>> ax = f_917(time_strings)", ">>> plt.show() # Display the plot"]}} -{"task_id": "f_542", "prompt": "import pandas as pd\nimport json\n\n\ndef f_542(file_path, key):\n \"\"\"\n Load a JSON file into a Pandas DataFrame, remove a specific key from each object and write the processed DataFrame back into a JSON file.\n \n Parameters:\n - file_path (str): The path to the JSON file.\n - key (str): The key to remove from each object.\n \n Returns:\n - df (DataFrame): A pandas DataFrame representation of the processed JSON data.\n\n Requirements:\n - pandas\n - json\n \n Example:\n >>> df = f_542('data.json', 'ele')\n \"\"\"", "canonical_solution": " with open(file_path, 'r') as file:\n data = json.load(file)\n\n df = pd.DataFrame(data)\n df.drop(key, axis=1, inplace=True)\n\n with open(file_path, 'w') as file:\n file.write(df.to_json(orient='records'))\n\n return df", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def base(self, json_path, key, contents):\n # Create JSON file\n with open(json_path, 'w') as file:\n json.dump(contents, file)\n # Run function\n df = f_542(json_path, key)\n # Check key is removed\n self.assertFalse(key in df.columns)\n # Check JSON file is updated\n with open(json_path, 'r') as file:\n data = json.load(file)\n self.assertFalse(key in data[0])\n # Remove JSON file\n os.remove(json_path)\n def test_case_1(self):\n self.base('data.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}])\n def test_case_2(self):\n self.base('data.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}, {'ele': 5, 'a': 6}])\n def test_case_3(self):\n self.base('x.json', 'zzz', [{'zzz': 1, 'a': 2}, {'zzz': 3, 'a': 4}])\n def 
test_case_4(self):\n self.base('g.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}])\n def test_case_5(self):\n self.base('data.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}])", "apis": ["pandas.DataFrame", "json.load"], "libs": ["pandas", "json"], "doc": {"description": ["Load a JSON file into a Pandas DataFrame, remove a specific key from each object and write the processed DataFrame back into a JSON file."], "note": [], "params": ["file_path (str): The path to the JSON file.", "key (str): The key to remove from each object."], "returns": ["df (DataFrame): A pandas DataFrame representation of the processed JSON data."], "reqs": ["pandas", "json"], "raises": [], "example": [">>> df = f_542('data.json', 'ele')"]}} -{"task_id": "f_847", "prompt": "import urllib.request\nimport re\nfrom collections import Counter\nimport matplotlib.pyplot as plt\n\n\ndef f_847(url):\n \"\"\"\n Downloads a text file from a specified URL, processes the text to count the frequency of each word,\n and then plots a bar chart showing the ten most frequently occurring words.\n\n Parameters:\n url (str): The URL from which the text file is to be downloaded. 
The URL should point directly to a text file.\n\n Returns:\n tuple: A tuple containing two elements:\n - Counter: A Counter object from the collections module, containing word frequencies in the text.\n - Axes: A matplotlib Axes object that represents the plotted bar chart of the ten most common words.\n\n Note:\n - The function assumes the URL points to a plain text file and may not handle binary files or non-text content correctly.\n - Words are identified using a basic regular expression and are case-sensitive.\n - The function does not remove common stopwords; all words are counted as is.\n - Requires internet access to download the file from the URL.\n\n Example:\n >>> word_freq, ax = f_847('http://www.example.com/data.txt')\n >>> print(word_freq.most_common(5))\n [('the', 102), ('of', 76), ('and', 64), ('to', 52), ('in', 41)]\n\n Requirements:\n - urllib\n - re\n - collections\n - matplotlib\n \n \"\"\"", "canonical_solution": " with urllib.request.urlopen(url) as response:\n text = response.read().decode()\n words = re.findall(r\"\\b\\w+\\b\", text)\n word_freq = Counter(words)\n top_words = word_freq.most_common(10)\n\n _, ax = plt.subplots()\n ax.bar([word[0] for word in top_words], [word[1] for word in top_words])\n ax.set_title(\"Top 10 Most Common Words\")\n ax.set_xlabel(\"Words\")\n ax.set_ylabel(\"Frequency\")\n\n return word_freq, ax", "test": "import unittest\nfrom unittest.mock import patch\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_847 function.\"\"\"\n @patch(\"urllib.request.urlopen\")\n def test_word_frequencies(self, mock_urlopen):\n \"\"\"Test that the function returns the correct word frequencies.\"\"\"\n # Mock the response data\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n b\"OpenAI OpenAI OpenAI benefits\"\n )\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(word_freq[\"OpenAI\"], 3)\n 
self.assertEqual(word_freq[\"benefits\"], 1)\n self.assertIsNotNone(ax)\n @patch(\"urllib.request.urlopen\")\n def test_empty_file(self, mock_urlopen):\n \"\"\"Test that the function returns an empty Counter object for an empty file.\"\"\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = b\"\"\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(len(word_freq), 0)\n self.assertIsNotNone(ax)\n @patch(\"urllib.request.urlopen\")\n def test_non_text_file(self, mock_urlopen):\n \"\"\"Test that the function raises an error for a non-text file.\"\"\"\n # Simulate a case where the URL does not point to a text file\n mock_urlopen.side_effect = Exception(\"Non-text file error\")\n with self.assertRaises(Exception):\n f_847(\"http://example.com\")\n @patch(\"urllib.request.urlopen\")\n def test_special_characters(self, mock_urlopen):\n \"\"\"Test that the function counts special characters as words.\"\"\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n b\"1234567890\"\n )\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(word_freq[\"1234567890\"], 1)\n self.assertIsNotNone(ax)\n @patch(\"urllib.request.urlopen\")\n def test_large_input(self, mock_urlopen):\n \"\"\"Test that the function can handle a large input.\"\"\"\n # Mock a large input\n mock_text = \" \".join([\"OpenAI\"] * 10000)\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n mock_text.encode()\n )\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(word_freq[\"OpenAI\"], 10000)\n self.assertIsNotNone(ax)\n def tearDown(self):\n plt.clf()", "apis": ["collections.Counter", "urllib.request.urlopen", "urllib.request", "matplotlib.pyplot.subplots", "re.findall"], "libs": ["urllib", "collections", "re", "matplotlib"], "doc": {"description": ["Downloads a text file from a 
specified URL, processes the text to count the frequency of each word,", "and then plots a bar chart showing the ten most frequently occurring words."], "note": ["The function assumes the URL points to a plain text file and may not handle binary files or non-text content correctly.", "Words are identified using a basic regular expression and are case-sensitive.", "The function does not remove common stopwords; all words are counted as is.", "Requires internet access to download the file from the URL."], "params": ["url (str): The URL from which the text file is to be downloaded. The URL should point directly to a text file."], "returns": ["tuple: A tuple containing two elements:", "Counter: A Counter object from the collections module, containing word frequencies in the text.", "Axes: A matplotlib Axes object that represents the plotted bar chart of the ten most common words."], "reqs": ["urllib", "re", "collections", "matplotlib"], "raises": [], "example": [">>> word_freq, ax = f_847('http://www.example.com/data.txt')", ">>> print(word_freq.most_common(5))", "[('the', 102), ('of', 76), ('and', 64), ('to', 52), ('in', 41)]"]}} -{"task_id": "f_534", "prompt": "import os\nimport random\n\ndef f_534(directory, n_files):\n \"\"\"\n Create n random txt files in a specific directory, write only a single digit random integer into each file, and then reset the cursor to the beginning of each file.\n\n Parameters:\n - directory (str): The directory in which to generate the files.\n - n_files (int): The number of files to generate.\n\n Returns:\n - n_files (int): The number of files generated.\n\n Requirements:\n - os\n - random\n\n Example:\n >>> random.seed(2)\n >>> f_534('/path/to/directory', 5)\n 5\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n os.makedirs(directory)\n\n for i in range(n_files):\n filename = os.path.join(directory, f\"file_{i+1}.txt\")\n\n with open(filename, 'w') as file:\n file.write(str(random.randint(0, 9)))\n file.seek(0)\n\n 
return n_files", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def base(self, dir, n_files, contents):\n random.seed(42)\n # Create directory\n if not os.path.exists(dir):\n os.makedirs(dir)\n # Run function\n n = f_534(dir, n_files)\n # Check files\n self.assertEqual(n, n_files)\n read_data = []\n for f in sorted(os.listdir(dir)):\n self.assertTrue(f.endswith('.txt'))\n with open(os.path.join(dir, f), 'r') as file:\n read_data.append(file.read())\n file.seek(0)\n self.assertEqual(read_data, contents)\n def tearDown(self):\n shutil.rmtree('./directory', ignore_errors=True)\n shutil.rmtree('./dir', ignore_errors=True)\n shutil.rmtree('./d', ignore_errors=True)\n def test_case_1(self):\n self.base('./directory', 5, ['1', '0', '4', '3', '3'])\n def test_case_2(self):\n self.base('./dir', 10, ['1', '9', '0', '4', '3', '3', '2', '1', '8', '1'])\n def test_case_3(self):\n self.base('./d', 15, ['1', '9', '6', '0', '0', '1', '3', '0', '4', '3', '3', '2', '1', '8', '1'])\n def test_case_4(self):\n self.base('./d', 20, ['1', '9', '6', '0', '0', '1', '3', '3', '8', '9', '0', '0', '8', '4', '3', '3', '2', '1', '8', '1'])\n def test_case_5(self):\n self.base('./directory', 25, ['1', '9', '6', '0', '0', '1', '3', '3', '8', '9', '0', '0', '8', '3', '8', '6', '3', '7', '4', '3', '3', '2', '1', '8', '1'])", "apis": ["os.path.exists", "random.randint", "os.path", "os.makedirs", "os.path.join"], "libs": ["random", "os"], "doc": {"description": ["Create n random txt files in a specific directory, write only a single digit random integer into each file, and then reset the cursor to the beginning of each file."], "note": [], "params": ["directory (str): The directory in which to generate the files.", "n_files (int): The number of files to generate."], "returns": ["n_files (int): The number of files generated."], "reqs": ["os", "random"], "raises": [], "example": [">>> random.seed(2)", ">>> f_534('/path/to/directory', 5)", "5"]}} -{"task_id": "f_422", 
"prompt": "import sqlite3\nimport pandas as pd\nimport os\n\n\ndef f_422(db_name, table_name, csv_path=\"data.csv\"):\n \"\"\"\n Read SQLite3 table via pandas and export to a CSV file.\n\n Parameters:\n - db_name (str): The path to the SQLite3 database.\n - table_name (str): The name of the table to export.\n - csv_path (str, optional): The path where the CSV file will be saved. Defaults to 'data.csv'.\n\n Requirements:\n - sqlite3\n - pandas\n - os\n\n Returns:\n str: The absolute path of the exported CSV file.\n\n Example:\n >>> f_422('test.db', 'People')\n 'data.csv'\n >>> f_422('/absolute/path/to/test.db', 'Orders', 'orders.csv')\n '/absolute/path/to/orders.csv'\n \"\"\"", "canonical_solution": " try:\n conn = sqlite3.connect(db_name)\n df = pd.read_sql_query(f\"SELECT * from {table_name}\", conn)\n df.to_csv(csv_path, index=False)\n return os.path.abspath(csv_path)\n finally:\n conn.close()", "test": "import unittest\nimport os\nimport tempfile\nimport shutil\nimport sqlite3\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.temp_dir_obj = tempfile.TemporaryDirectory()\n cls.temp_dir = cls.temp_dir_obj.name\n cls.db_path = os.path.join(cls.temp_dir, \"test.db\")\n # Setup the database and tables\n conn = sqlite3.connect(cls.db_path)\n cursor = conn.cursor()\n # Create tables and insert some data\n cursor.execute(\"CREATE TABLE People (Name TEXT, Age INTEGER)\")\n cursor.execute(\n \"INSERT INTO People VALUES ('Alice', 30), ('Bob', 25), ('Charlie', 35)\"\n )\n cursor.execute(\"CREATE TABLE Orders (Product TEXT, Quantity INTEGER)\")\n cursor.execute(\n \"INSERT INTO Orders VALUES ('Widgets', 5), ('Gadgets', 10), ('Doodads', 15)\"\n )\n conn.commit()\n conn.close()\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir_obj.cleanup()\n def test_case_1(self):\n # Test exporting the People table\n csv_path = os.path.join(self.temp_dir, \"data.csv\")\n output_path = f_422(self.db_path, \"People\", csv_path)\n 
self.assertTrue(os.path.exists(output_path), \"CSV file not created.\")\n df = pd.read_csv(output_path)\n self.assertEqual(len(df), 3, \"CSV contains incorrect number of rows.\")\n self.assertTrue(\"Alice\" in df[\"Name\"].values, \"Expected data not found in CSV.\")\n def test_case_2(self):\n # Test exporting the Orders table\n csv_path = os.path.join(self.temp_dir, \"orders.csv\")\n output_path = f_422(self.db_path, \"Orders\", csv_path)\n self.assertTrue(os.path.exists(output_path), \"CSV file not created.\")\n df = pd.read_csv(output_path)\n self.assertEqual(len(df), 3, \"CSV contains incorrect number of rows.\")\n self.assertTrue(5 in df[\"Quantity\"].values, \"Expected data not found in CSV.\")\n def test_case_3(self):\n # Test exporting with a custom CSV path\n custom_path = os.path.join(self.temp_dir, \"custom_data.csv\")\n output_path = f_422(self.db_path, \"People\", custom_path)\n self.assertTrue(\n os.path.exists(output_path), \"CSV file not created at custom path.\"\n )\n self.assertEqual(\n output_path,\n os.path.abspath(custom_path),\n \"Returned path does not match expected path.\",\n )\n def test_case_4(self):\n # Test with a non-existent database\n with self.assertRaises(Exception):\n f_422(os.path.join(self.temp_dir, \"nonexistent.db\"), \"People\")\n def test_case_5(self):\n # Test with a non-existent table\n with self.assertRaises(pd.io.sql.DatabaseError):\n f_422(self.db_path, \"NonexistentTable\")\n def test_case_6(self):\n # Test if the function overwrites an existing CSV file\n csv_path = os.path.join(self.temp_dir, \"data.csv\")\n with open(csv_path, \"w\") as file:\n file.write(\"Old Content\")\n output_path = f_422(self.db_path, \"People\", csv_path)\n self.assertTrue(os.path.exists(output_path), \"CSV file not created.\")\n with open(output_path, \"r\") as file:\n content = file.read()\n self.assertNotEqual(\n \"Old Content\", content, \"Old content found in CSV. 
Overwriting failed.\"\n )\n def test_case_7(self):\n # Test error handling with invalid CSV path\n with self.assertRaises(OSError):\n f_422(self.db_path, \"People\", \"/nonexistent_path/data.csv\")", "apis": ["os.path", "os.path.abspath", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["sqlite3", "pandas", "os"], "doc": {"description": ["Read SQLite3 table via pandas and export to a CSV file."], "note": [], "params": ["db_name (str): The path to the SQLite3 database.", "table_name (str): The name of the table to export.", "csv_path (str, optional): The path where the CSV file will be saved. Defaults to 'data.csv'."], "returns": ["str: The absolute path of the exported CSV file."], "reqs": ["sqlite3", "pandas", "os"], "raises": [], "example": [">>> f_422('test.db', 'People')", "'data.csv'", ">>> f_422('/absolute/path/to/test.db', 'Orders', 'orders.csv')", "'/absolute/path/to/orders.csv'"]}} -{"task_id": "f_541", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.preprocessing import StandardScaler\n\ndef f_541(df, features):\n \"\"\"\n Standardize the functions in a DataFrame.\n The function applies standard scaling to the features.\n \n Parameters:\n - df (pandas.DataFrame): The input DataFrame.\n - features (list): The list of features to standardize. 
May be empty.\n \n Returns:\n - df (pandas.DataFrame): The DataFrame with the standardized features.\n\n Requirements:\n - pandas\n - numpy\n - scikit-learn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])\n >>> df = f_541(df, ['a', 'b'])\n >>> print(df)\n a b c\n 0 0.608932 0.127900 0.647689\n 1 2.025355 0.031682 -0.234137\n 2 2.102894 1.036701 -0.469474\n 3 0.672204 -0.198368 -0.465730\n 4 0.257348 -1.653196 -1.724918\n 5 -0.852601 -0.749663 0.314247\n 6 -1.329753 -1.150504 1.465649\n 7 -0.388180 0.334397 -1.424748\n 8 -0.827890 0.377940 -1.150994\n 9 0.441917 -0.336059 -0.291694\n 10 -0.907003 2.125260 -0.013497\n 11 -1.536337 1.092000 -1.220844\n 12 0.211669 -1.699745 -1.328186\n 13 0.195104 1.007633 0.171368\n 14 -0.236192 -0.035498 -1.478522\n 15 -1.070045 -0.195579 1.057122\n 16 0.397644 -1.502441 0.324084\n 17 -0.608039 -0.412603 0.611676\n 18 1.346302 1.201107 -0.839218\n 19 -0.503330 0.599035 0.975545\n \"\"\"", "canonical_solution": " if not features:\n return df\n\n # Initialize the StandardScaler\n scaler = StandardScaler()\n \n # Apply StandardScaler to the specified features\n # Using pd.DataFrame to explicitly reference DataFrame operations\n df.loc[:, features] = pd.DataFrame(scaler.fit_transform(df.loc[:, features]), columns=features, index=df.index)\n\n # Example of explicit np usage, even though not necessary for this function\n # Just for demonstration: add a dummy operation using np\n df['dummy'] = np.zeros(len(df))\n\n return df.drop('dummy', axis=1) ", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])\n df = f_541(df, ['a', 'b'])\n self.assertEqual(df.shape, (10, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] >= -3) and np.all(df['a'] <= 3))\n self.assertTrue(np.all(df['b'] >= 
-3) and np.all(df['b'] <= 3))\n self.assertTrue(np.all(df['c'] >= -3) and np.all(df['c'] <= 3))\n def test_case_2(self):\n df = pd.DataFrame({'a': [0, 0, 0], 'b': [0, 0, 0], 'c': [0, 0, 0]})\n df = f_541(df, ['a', 'b'])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] == 0))\n self.assertTrue(np.all(df['b'] == 0))\n self.assertTrue(np.all(df['c'] == 0))\n def test_case_3(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n df = f_541(df, ['a', 'b'])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] >= -3) and np.all(df['a'] <= 3))\n self.assertTrue(np.all(df['b'] >= -3) and np.all(df['b'] <= 3))\n self.assertTrue(np.all(df['c'] == [7, 8, 9]))\n def test_case_4(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n df = f_541(df, ['c'])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] == [1, 2, 3]))\n self.assertTrue(np.all(df['b'] == [4, 5, 6]))\n self.assertTrue(np.all(df['c'] >= -3) and np.all(df['c'] <= 3))\n def test_case_5(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n df = f_541(df, [])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] == [1, 2, 3]))\n self.assertTrue(np.all(df['b'] == [4, 5, 6]))\n self.assertTrue(np.all(df['c'] == [7, 8, 9]))", "apis": ["numpy.zeros", "pandas.DataFrame", "sklearn.preprocessing.StandardScaler"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Standardize the functions in a DataFrame.", "The function applies 
standard scaling to the features."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame.", "features (list): The list of features to standardize. May be empty."], "returns": ["df (pandas.DataFrame): The DataFrame with the standardized features."], "reqs": ["pandas", "numpy", "scikit-learn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])", ">>> df = f_541(df, ['a', 'b'])", ">>> print(df)", "a b c", "0 0.608932 0.127900 0.647689", "1 2.025355 0.031682 -0.234137", "2 2.102894 1.036701 -0.469474", "3 0.672204 -0.198368 -0.465730", "4 0.257348 -1.653196 -1.724918", "5 -0.852601 -0.749663 0.314247", "6 -1.329753 -1.150504 1.465649", "7 -0.388180 0.334397 -1.424748", "8 -0.827890 0.377940 -1.150994", "9 0.441917 -0.336059 -0.291694", "10 -0.907003 2.125260 -0.013497", "11 -1.536337 1.092000 -1.220844", "12 0.211669 -1.699745 -1.328186", "13 0.195104 1.007633 0.171368", "14 -0.236192 -0.035498 -1.478522", "15 -1.070045 -0.195579 1.057122", "16 0.397644 -1.502441 0.324084", "17 -0.608039 -0.412603 0.611676", "18 1.346302 1.201107 -0.839218", "19 -0.503330 0.599035 0.975545"]}} +{"task_id": "f_377", "prompt": "import random\nimport string\nimport pandas as pd\n\n\ndef f_377(data_list, seed=0):\n \"\"\"\n Replace a random substring (a sequence of characters between two commas or at the beginning/end of the string)\n in a list of strings with a random string (comprising ascii lowercase characters) with the same length as\n the substituted characters.\n\n Parameters:\n data_list (list): Input list of strings.\n Within each string, each substring's leading and trailing whitespaces are removed.\n If empty, it will return a DataFrame with the Original String and Modified String\n columns that is otherwise empty.\n seed (int, optional): The seed for random operations to ensure reproducibility. 
Defaults to 0.\n\n Returns:\n DataFrame: A pandas DataFrame with two columns - 'Original String' and 'Modified String'.\n 'Original String' contains the original strings from the input list, and 'Modified String'\n contains the modified strings where a random substring has been replaced.\n\n Requirements:\n - pandas\n - random\n - string\n\n Example:\n >>> f_377(['lamp, bag, mirror', 'table, chair, bag, lamp'])\n Original String Modified String\n 0 lamp, bag, mirror lamp, tkg, mirror\n 1 table, chair, bag, lamp table, chair, bag, kuhm\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n modified_strings = []\n for s in data_list:\n s = s.strip()\n if not s:\n modified_strings.append(s)\n continue\n substrings = [ss.strip() for ss in s.split(\",\")]\n replace_idx = random.randint(0, len(substrings) - 1)\n random_string = \"\".join(\n random.choices(string.ascii_lowercase, k=len(substrings[replace_idx]))\n )\n substrings[replace_idx] = random_string\n modified_string = \", \".join(substrings)\n modified_strings.append(modified_string)\n\n df[\"Modified String\"] = modified_strings\n\n return df", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with a typical input list\n input_data = [\"lamp, bag, mirror\", \"table, chair, bag, lamp\"]\n result = f_377(input_data, seed=0)\n self.assertTrue(all(item in input_data for item in result[\"Original String\"]))\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n def test_case_2(self):\n # Test with a single-item list\n input_data = [\"lamp, bag, mirror\"]\n result = f_377(input_data, seed=0)\n self.assertTrue(all(item in input_data for item in result[\"Original String\"]))\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n def test_case_3(self):\n # Test with a list of varied length 
strings\n input_data = [\"lamp, chair\", \"table, mirror, bag\", \"desk, bed\"]\n result = f_377(input_data, seed=0)\n self.assertTrue(all(item in input_data for item in result[\"Original String\"]))\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n def test_case_4(self):\n # Test with an empty list\n input_data = []\n result = f_377(input_data, seed=0)\n self.assertEqual(len(result), 0)\n def test_case_5(self):\n # Test with a list of empty strings\n input_data = [\"\", \"\", \"\"]\n result = f_377(input_data, seed=0)\n self.assertEqual(result[\"Original String\"].tolist(), [\"\", \"\", \"\"])\n self.assertEqual(result[\"Modified String\"].tolist(), [\"\", \"\", \"\"])\n def test_case_6(self):\n # Test with strings that have no commas\n input_data = [\"lamps\", \"table\"]\n result = f_377(input_data, seed=1)\n self.assertTrue(\n all(len(modified) == 5 for modified in result[\"Modified String\"])\n )\n def test_case_7(self):\n # Test with strings that contain multiple identical substrings\n input_data = [\"lamp, lamp, lamp\"]\n result = f_377(input_data, seed=2)\n self.assertNotEqual(result[\"Original String\"][0], result[\"Modified String\"][0])\n self.assertTrue(\n any(sub != \"lamp\" for sub in result[\"Modified String\"][0].split(\", \"))\n )\n def test_case_8(self):\n # Test with mixed case input strings\n input_data = [\"Lamp, Bag, Mirror\"]\n result = f_377(input_data, seed=4)\n self.assertNotEqual(\n result[\"Original String\"].tolist(), result[\"Modified String\"].tolist()\n )\n self.assertTrue(\n any(char.islower() for char in result[\"Modified String\"][0])\n ) # Ensure replacement is in lowercase\n def test_case_9(self):\n # Test effect of different seeds on output\n input_data = [\"lamp, bag, mirror\"]\n result_seed_0a = f_377(input_data, seed=0)\n result_seed_0b = f_377(input_data, seed=0)\n result_seed_5 = f_377(input_data, seed=5)\n self.assertEqual(\n result_seed_0a[\"Modified String\"][0], 
result_seed_0b[\"Modified String\"][0]\n )\n self.assertNotEqual(\n result_seed_0a[\"Modified String\"][0], result_seed_5[\"Modified String\"][0]\n )\n def test_case_10(self):\n # Test case sensitivity\n input_data = [\"Lamp, Bag, Mirror\"]\n result = f_377(input_data, seed=3)\n original_items = [\n item.lower() for item in result[\"Original String\"][0].split(\", \")\n ]\n modified_items = [item for item in result[\"Modified String\"][0].split(\", \")]\n self.assertTrue(\n any(mod_item not in original_items for mod_item in modified_items),\n \"Modified string should contain a lowercase random replacement not present in the original string\",\n )\n def test_case_11(self):\n # Test whitespaces (i.e. make sure leading/trailing whitespaces are removed in processing substrings)\n input_data = [\" lamp, bag ,mirror \"]\n result = f_377(input_data, seed=3)\n modified = result[\"Modified String\"][0].split(\", \")\n self.assertTrue(\n all(item.strip() == item for item in modified),\n \"All items in the modified string should have leading and trailing whitespaces removed\",\n )", "apis": ["pandas.DataFrame", "random.randint", "string.ascii_lowercase", "random.seed", "random.choices"], "libs": ["random", "pandas", "string"], "doc": {"description": ["Replace a random substring (a sequence of characters between two commas or at the beginning/end of the string)", "in a list of strings with a random string (comprising ascii lowercase characters) with the same length as", "the substituted characters."], "note": [], "params": ["data_list (list): Input list of strings.", "Within each string, each substring's leading and trailing whitespaces are removed.", "If empty, it will return a DataFrame with the Original String and Modified String", "columns that is otherwise empty.", "seed (int, optional): The seed for random operations to ensure reproducibility. 
Defaults to 0."], "returns": ["DataFrame: A pandas DataFrame with two columns - 'Original String' and 'Modified String'.", "'Original String' contains the original strings from the input list, and 'Modified String'", "contains the modified strings where a random substring has been replaced."], "reqs": ["pandas", "random", "string"], "raises": [], "example": [">>> f_377(['lamp, bag, mirror', 'table, chair, bag, lamp'])", "Original String Modified String", "0 lamp, bag, mirror lamp, tkg, mirror", "1 table, chair, bag, lamp table, chair, bag, kuhm"]}} +{"task_id": "f_761", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_761(df, column):\n \"\"\"\n Draw and return a bar chart that shows the distribution of categories in a specific column of a DataFrame.\n \n Note:\n The categories are defined by the constant CATEGORIES, \n which is a list containing ['A', 'B', 'C', 'D', 'E']. If some categories are missing in the DataFrame, \n they will be included in the plot with a count of zero.\n The x label of the plot is set to 'Category', the y label is set to 'Count', and the title is set to 'Distribution of {column}'.\n \n Parameters:\n - df (pandas.DataFrame): The DataFrame to be processed.\n - column (str): The name of the column in the DataFrame that contains the categories.\n \n Output:\n - matplotlib.axes._subplots.Axes: The Axes object for the generated plot.\n \n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> df = pd.DataFrame({'Category': ['A', 'B', 'B', 'C', 'A', 'D', 'E', 'E', 'D']})\n >>> ax = f_761(df, 'Category')\n # This generates and displays a bar chart showing the distribution of each category within the 'Category' column.\n \n >>> df = pd.DataFrame({'Type': ['A', 'A', 'C', 'E', 'D', 'E', 'D']})\n >>> ax = f_761(df, 'Type')\n \"\"\"", "canonical_solution": " # Define the categories\n CATEGORIES = ['A', 'B', 'C', 'D', 'E']\n \n # Count occurrences of each category\n counts = df[column].value_counts()\n 
missing_categories = list(set(CATEGORIES) - set(counts.index))\n for category in missing_categories:\n counts[category] = 0\n\n counts = counts.reindex(CATEGORIES)\n \n # Plotting\n ax = counts.plot(kind='bar')\n ax.set_xlabel('Category')\n ax.set_ylabel('Count')\n ax.set_title(f'Distribution of {column}')\n plt.show()\n \n return ax", "test": "import unittest\nimport pandas as pd\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n \n def test_with_all_categories(self):\n \"\"\"Test with all categories present.\"\"\"\n df = pd.DataFrame({'Category': ['A', 'B', 'B', 'C', 'A', 'D', 'E', 'E', 'D']})\n ax = f_761(df, 'Category')\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_xlabel(), 'Category')\n self.assertEqual(ax.get_ylabel(), 'Count')\n self.assertEqual(ax.get_title(), 'Distribution of Category')\n self.assertEqual(len(ax.get_xticks()), 5) # Check the number of x-axis ticks instead\n def test_with_missing_categories(self):\n \"\"\"Test with some categories missing.\"\"\"\n df = pd.DataFrame({'Category': ['A', 'A', 'B', 'C']})\n ax = f_761(df, 'Category')\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.get_xticks()), 5) # Ensure all categories are accounted for, including missing ones\n def test_with_unexpected_category(self):\n \"\"\"Test with a category not in predefined list.\"\"\"\n df = pd.DataFrame({'Category': ['F', 'A', 'B']}) # 'F' is not a predefined category\n ax = f_761(df, 'Category')\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.get_xticks()), 5) # 'F' is ignored, only predefined categories are considered", "apis": ["matplotlib.pyplot.show"], "libs": ["matplotlib"], "doc": {"description": ["Draw and return a bar chart that shows the distribution of categories in a specific column of a DataFrame.", "Output:", "- matplotlib.axes._subplots.Axes: The Axes object for the generated plot.", ">>> df = pd.DataFrame({'Type': ['A', 'A', 'C', 'E', 'D', 'E', 'D']})", ">>> ax = f_761(df, 
'Type')"], "note": ["The categories are defined by the constant CATEGORIES,", "which is a list containing ['A', 'B', 'C', 'D', 'E']. If some categories are missing in the DataFrame,", "they will be included in the plot with a count of zero.", "The x label of the plot is set to 'Category', the y label is set to 'Count', and the title is set to 'Distribution of {column}'."], "params": ["df (pandas.DataFrame): The DataFrame to be processed.", "column (str): The name of the column in the DataFrame that contains the categories."], "returns": [], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'Category': ['A', 'B', 'B', 'C', 'A', 'D', 'E', 'E', 'D']})", ">>> ax = f_761(df, 'Category')", "# This generates and displays a bar chart showing the distribution of each category within the 'Category' column."]}} +{"task_id": "f_874", "prompt": "import random\nimport string\nimport pandas as pd\n\n\ndef f_874(n_rows=1000):\n \"\"\"\n Generate a histogram of the frequency of the top 30 unique random 3-letter strings.\n The function creates random strings, each consisting of 3 letters from the lowercase English alphabet.\n It then plots a histogram showing the frequencies of the top 30 most common strings among the generated set.\n\n Parameters:\n - n_rows (int): Number of random 3-letter strings to generate.\n Must be positive. 
Default is 1000.\n\n Returns:\n - ax (matplotlib.axes.Axes): A Matplotlib Axes object containing the histogram.\n Each bar represents one of the top 30 most frequent 3-letter strings.\n\n Raises:\n - ValueError: If `n_rows` is less than or equal to 0.\n\n Requirements:\n - random\n - string\n - pandas\n \n Example:\n >>> ax = f_874(1000)\n >>> ax.get_title()\n 'Top 30 Frequencies of Random 3-Letter Strings'\n \"\"\"", "canonical_solution": " # Check if n_rows is positive\n if n_rows <= 0:\n raise ValueError(\"Number of rows must be greater than 0\")\n\n # Generate random strings\n data = [\"\".join(random.choices(string.ascii_lowercase, k=3)) for _ in range(n_rows)]\n df = pd.DataFrame(data, columns=[\"String\"])\n\n # Aggregate and plot the data\n frequency = df[\"String\"].value_counts()\n ax = frequency.head(30).plot(\n kind=\"bar\"\n ) # Limit to the top 30 frequencies for readability\n ax.set_title(\"Top 30 Frequencies of Random 3-Letter Strings\")\n ax.set_xlabel(\"String\")\n ax.set_ylabel(\"Frequency\")\n\n return ax", "test": "import unittest\nimport random\nfrom matplotlib.axes import Axes\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_874.\"\"\"\n def test_return_type(self):\n \"\"\"Test if the function returns a Matplotlib Axes object.\"\"\"\n random.seed(0)\n result = f_874(100)\n self.assertIsInstance(result, Axes)\n def test_default_parameter(self):\n \"\"\"Test the function with the default parameter.\"\"\"\n result = f_874()\n self.assertIsInstance(result, Axes)\n def test_zero_rows(self):\n \"\"\"Test the function with zero rows.\"\"\"\n with self.assertRaises(ValueError):\n f_874(0)\n def test_negative_rows(self):\n \"\"\"Test the function with a negative number of rows.\"\"\"\n with self.assertRaises(ValueError):\n f_874(-1)\n def test_large_number_of_rows(self):\n \"\"\"Test the function with a large number of rows.\"\"\"\n random.seed(2)\n result = f_874(10000)\n 
self.assertIsInstance(result, Axes)\n def tearDown(self):\n plt.close()", "apis": ["pandas.DataFrame", "string.ascii_lowercase", "random.choices"], "libs": ["random", "pandas", "string"], "doc": {"description": ["Generate a histogram of the frequency of the top 30 unique random 3-letter strings.", "The function creates random strings, each consisting of 3 letters from the lowercase English alphabet.", "It then plots a histogram showing the frequencies of the top 30 most common strings among the generated set."], "note": [], "params": ["n_rows (int): Number of random 3-letter strings to generate.", "Must be positive. Default is 1000."], "returns": ["ax (matplotlib.axes.Axes): A Matplotlib Axes object containing the histogram.", "Each bar represents one of the top 30 most frequent 3-letter strings."], "reqs": ["random", "string", "pandas"], "raises": ["ValueError: If `n_rows` is less than or equal to 0."], "example": [">>> ax = f_874(1000)", ">>> ax.get_title()", "'Top 30 Frequencies of Random 3-Letter Strings'"]}} +{"task_id": "f_886", "prompt": "import smtplib\nfrom email.message import EmailMessage\nimport getpass\n\nSERVER_ADDRESS = \"localhost\"\nSERVER_PORT = 25\nBUFFER_SIZE = 1024\nSMTP_SERVER = \"smtp.gmail.com\"\nSMTP_PORT = 587\n\n\ndef f_886(client_socket):\n \"\"\"\n Receive a message from a client socket and send it as an email via an SMTP server.\n\n Parameters:\n client_socket (socket.socket): The client socket from which the message is received.\n\n Returns:\n - None\n\n Note:\n - Requires a working internet connection and access to an SMTP server.\n - The function asks for the sender's email, recipient's email,\n and sender's email password for authentication.\n\n Requirements:\n - smtplib\n - email.message.EmailMessage\n - getpass\n\n Example:\n >>> import socket\n >>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n >>> server_socket.bind((SERVER_ADDRESS, SERVER_PORT))\n >>> server_socket.listen(5)\n >>> client_socket, addr = 
server_socket.accept()\n >>> f_886(client_socket)\n \"\"\"", "canonical_solution": " request = client_socket.recv(BUFFER_SIZE).decode(\"utf-8\")\n print(f\"Received: {request}\")\n\n email = EmailMessage()\n email[\"From\"] = getpass.getpass(\"Email: \")\n email[\"To\"] = getpass.getpass(\"Recipient: \")\n email[\"Subject\"] = \"Message from socket client\"\n email.set_content(request)\n\n with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as smtp:\n smtp.starttls()\n smtp.login(email[\"From\"], getpass.getpass(\"Password: \"))\n smtp.send_message(email)\n\n response = \"Message sent.\"\n client_socket.send(response.encode(\"utf-8\"))\n client_socket.close()", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport smtplib\nfrom email.message import EmailMessage\nimport getpass\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_886\"\"\"\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_successful_email_send(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test if the email is successfully sent with valid inputs.\n \"\"\"\n # Mock behaviors\n mock_socket.return_value.recv.return_value = b\"Test message\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n # Call the function\n f_886(mock_socket())\n # Assertions\n mock_smtp.assert_called_with(\"smtp.gmail.com\", 587)\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_email_with_empty_message(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test behavior when an empty message is received.\n \"\"\"\n # Mock the recv method to return an empty byte string\n mock_socket.return_value.recv.return_value = b\"\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n mock_smtp_instance = MagicMock()\n mock_smtp.return_value = mock_smtp_instance\n client_socket = MagicMock()\n # Simulate the 
recv and decode behavior by setting the return value of the decode method\n client_socket.recv.return_value.decode.return_value = \"\"\n f_886(client_socket)\n mock_smtp_instance.send_message.assert_not_called()\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_smtp_server_connection_error(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test behavior when there is a network error (e.g., SMTP server unreachable).\n \"\"\"\n # Setup mock for recv to return a valid bytes object\n client_socket = MagicMock()\n client_socket.recv.return_value = b\"Test message\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n mock_smtp.side_effect = smtplib.SMTPConnectError(\n 421, \"Failed to connect to the server\"\n )\n # Expecting an SMTPConnectError\n with self.assertRaises(smtplib.SMTPConnectError):\n f_886(client_socket)\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_socket_closes_after_operation(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test if the socket is properly closed after the operation.\n \"\"\"\n # Setup mock for recv to return a valid bytes object\n client_socket = MagicMock()\n client_socket.recv.return_value = b\"Test message\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n \"password\",\n ]\n f_886(client_socket)\n # Assert that the socket's close method was called\n client_socket.close.assert_called_once()\n @patch(\"socket.socket\")\n @patch(\"smtplib.SMTP\")\n @patch(\"getpass.getpass\")\n def test_successful_email_dispatch(self, mock_getpass, mock_smtp, mock_socket):\n \"\"\"\n Test if the email is successfully composed and sent with valid inputs.\n \"\"\"\n client_socket = MagicMock()\n client_socket.recv.return_value = b\"Hello, this is a test message.\"\n mock_getpass.side_effect = [\n \"sender@example.com\",\n \"recipient@example.com\",\n 
\"password\",\n ]\n mock_smtp_instance = MagicMock()\n mock_smtp.return_value = mock_smtp_instance\n f_886(client_socket)\n # Assert that the SMTP instance was created\n mock_smtp.assert_called_with(\"smtp.gmail.com\", 587)\n success_response = \"Message sent.\"\n client_socket.send.assert_called_with(success_response.encode(\"utf-8\"))\n client_socket.close.assert_called_once()", "apis": ["email.message.EmailMessage", "smtplib.SMTP", "getpass.getpass"], "libs": ["getpass", "smtplib", "email"], "doc": {"description": ["Receive a message from a client socket and send it as an email via an SMTP server."], "note": ["Requires a working internet connection and access to an SMTP server.", "The function asks for the sender's email, recipient's email,", "and sender's email password for authentication."], "params": ["client_socket (socket.socket): The client socket from which the message is received."], "returns": ["None"], "reqs": ["smtplib", "email.message.EmailMessage", "getpass"], "raises": [], "example": [">>> import socket", ">>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)", ">>> server_socket.bind((SERVER_ADDRESS, SERVER_PORT))", ">>> server_socket.listen(5)", ">>> client_socket, addr = server_socket.accept()", ">>> f_886(client_socket)"]}} +{"task_id": "f_736", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nARRAY_SIZE = 10000\n\ndef f_736():\n \"\"\"\n Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution.\n\n Note:\n The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Integers\". \n The x-axis is labeled \"Value\" and the y-axis is labeled \"Frequency\". 
\n The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines.\n \n Returns:\n Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes).\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> array, mean, std, ax = f_736()\n >>> print(mean, std)\n 49.6135 28.5323416100046\n >>> plt.show()\n \"\"\"", "canonical_solution": " array = np.random.randint(1, 100, size=ARRAY_SIZE)\n mean = np.mean(array)\n std = np.std(array)\n\n fig, ax = plt.subplots()\n ax.hist(array, bins='auto')\n ax.set_title('Histogram of Random Integers')\n ax.set_xlabel('Value')\n ax.set_ylabel('Frequency')\n ax.axvline(mean, color='red', linestyle='dashed', linewidth=1)\n ax.axvline(mean + std, color='purple', linestyle='dashed', linewidth=1)\n ax.axvline(mean - std, color='purple', linestyle='dashed', linewidth=1)\n ax.legend([\"Mean\", \"Standard Deviation\"])\n plt.show()\n \n return array, mean, std, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n np.random.seed(0)\n array, mean, std, ax = f_736()\n self.assertEqual(array.size, ARRAY_SIZE)\n self.assertEqual(mean, 49.6135)\n self.assertEqual(std, 28.5323416100046)\n self.assertEqual(ax.get_title(), 'Histogram of Random Integers')\n def test_case_2(self):\n array, mean, std, ax = f_736()\n self.assertEqual(ax.get_xlabel(), 'Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n def test_case_3(self):\n np.random.seed(1)\n array, mean, std, ax = f_736()\n self.assertEqual(mean, 50.0717)\n self.assertEqual(std, 28.559862729186918)\n def test_case_4(self):\n np.random.seed(100)\n array, mean, std, ax = f_736()\n self.assertEqual(mean, 50.2223)\n self.assertEqual(std, 28.494467580742757)\n def test_case_5(self):\n np.random.seed(500)\n array, mean, std, ax = f_736()\n self.assertEqual(mean, 49.8636)\n self.assertEqual(std, 
28.516030492338864)", "apis": ["matplotlib.pyplot.show", "numpy.random.randint", "numpy.random", "numpy.mean", "matplotlib.pyplot.subplots", "numpy.std"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution."], "note": ["The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Integers\".", "The x-axis is labeled \"Value\" and the y-axis is labeled \"Frequency\".", "The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines."], "params": [], "returns": ["Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes)."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(0)", ">>> array, mean, std, ax = f_736()", ">>> print(mean, std)", "49.6135 28.5323416100046", ">>> plt.show()"]}} +{"task_id": "f_866", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_866(dataframe):\n \"\"\"\n Calculate the correlation matrix of a DataFrame and plot a scatter plot for the pair of columns with the highest absolute correlation.\n\n Parameters:\n - dataframe (pd.DataFrame): The DataFrame containing numeric columns for correlation calculation.\n\n Returns:\n - ax (plt.Axes): The scatter plot of the pair of columns with the highest absolute correlation.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib\n\n Exception Handling:\n - Raises ValueError if the input DataFrame is empty.\n - Raises TypeError if any column in the DataFrame is non-numeric.\n - Raises ValueError if the DataFrame has fewer than two columns.\n\n Example:\n >>> df = pd.DataFrame({\n ... 'A': np.random.rand(100),\n ... 'B': np.random.rand(100),\n ... 'C': np.random.rand(100)\n ... 
})\n >>> ax = f_866(df)\n >>> print(ax)\n Axes(0.125,0.11;0.775x0.77)\n \"\"\"", "canonical_solution": "\n if dataframe.empty:\n raise ValueError(\"DataFrame is empty.\")\n \n if not all(dataframe.dtypes.apply(lambda x: np.issubdtype(x, np.number))):\n raise TypeError(\"All columns must be numeric for correlation calculation.\")\n\n if dataframe.shape[1] < 2:\n raise ValueError(\"DataFrame must have at least two columns for correlation calculation.\")\n\n # Explicit use of pd.DataFrame.corr() to calculate the correlation matrix\n corr_matrix = pd.DataFrame.corr(dataframe)\n abs_corr_matrix = corr_matrix.abs()\n\n # Finding the pair of columns with the highest absolute correlation\n highest_corr_value = abs_corr_matrix.unstack().dropna().nlargest(2).iloc[-1]\n max_corr_pair = np.where(abs_corr_matrix == highest_corr_value)\n\n # Extracting column names for the highest correlation\n column_x = dataframe.columns[max_corr_pair[0][0]]\n column_y = dataframe.columns[max_corr_pair[1][0]]\n\n # Using plt to plot the scatter plot\n plt.figure(figsize=(10, 6)) # Creating a figure\n plt.scatter(dataframe[column_x], dataframe[column_y]) # Plotting the scatter plot\n plt.title(f\"Scatter plot between {column_x} and {column_y}\") # Setting the title\n plt.xlabel(column_x) # Setting the x-axis label\n plt.ylabel(column_y) # Setting the y-axis label\n plt.show() # Displaying the figure\n\n return plt.gca() # Returning the current Axes object for further use", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_866.\"\"\"\n def test_high_correlation(self):\n \"\"\"\n Test if the function correctly identifies and plots the pair of columns with the highest positive correlation.\n \"\"\"\n np.random.seed(0) # Set a fixed seed for reproducibility\n df = pd.DataFrame(\n {\"A\": np.arange(100), \"B\": np.arange(100) * 2, \"C\": np.random.rand(100)}\n )\n ax = 
f_866(df)\n corr = df.corr()\n abs_corr = corr.abs()\n max_corr = abs_corr.unstack().dropna().nlargest(3).iloc[-1]\n expected_pair = np.where(abs_corr == max_corr)\n expected_labels = (\n df.columns[expected_pair[0][0]],\n df.columns[expected_pair[1][0]],\n )\n self.assertEqual((ax.get_xlabel(), ax.get_ylabel()), expected_labels)\n def test_no_correlation(self):\n \"\"\"\n Test if the function handles a case where there is no significant correlation between columns.\n \"\"\"\n np.random.seed(1)\n df = pd.DataFrame(\n {\n \"A\": np.random.rand(100),\n \"B\": np.random.rand(100),\n \"C\": np.random.rand(100),\n }\n )\n ax = f_866(df)\n self.assertIsInstance(ax, plt.Axes)\n def test_negative_correlation(self):\n \"\"\"\n Test if the function correctly identifies and plots the pair of columns with the highest absolute correlation,\n including negative correlations.\n \"\"\"\n np.random.seed(2)\n df = pd.DataFrame(\n {\"A\": np.arange(100), \"B\": np.random.rand(100), \"C\": -np.arange(100) + 50}\n )\n ax = f_866(df)\n corr = df.corr()\n # Get the pair with the highest absolute correlation excluding self-correlations\n abs_corr = corr.abs()\n max_corr = abs_corr.unstack().dropna().nlargest(3).iloc[-1]\n expected_pair = np.where(abs_corr == max_corr)\n expected_labels = (\n df.columns[expected_pair[0][0]],\n df.columns[expected_pair[1][0]],\n )\n self.assertEqual((ax.get_xlabel(), ax.get_ylabel()), expected_labels)\n def test_single_column(self):\n \"\"\"\n Test if the function raises a ValueError when provided with a DataFrame containing only one column.\n \"\"\"\n np.random.seed(3)\n df = pd.DataFrame({\"A\": np.random.rand(100)})\n with self.assertRaises(ValueError):\n f_866(df)\n def test_non_numeric_columns(self):\n \"\"\"\n Test if the function raises a TypeError when provided with a DataFrame containing non-numeric columns.\n \"\"\"\n np.random.seed(4)\n df = pd.DataFrame(\n {\"A\": np.random.rand(100), \"B\": [\"text\"] * 100, \"C\": np.random.rand(100)}\n )\n 
with self.assertRaises(TypeError):\n f_866(df)\n def test_empty_dataframe(self):\n \"\"\"\n Test if the function raises a ValueError when provided with an empty DataFrame.\n \"\"\"\n df = pd.DataFrame() # Create an empty DataFrame\n with self.assertRaises(ValueError):\n f_866(df)", "apis": ["pandas.DataFrame.corr", "matplotlib.pyplot.figure", "matplotlib.pyplot.show", "pandas.DataFrame", "matplotlib.pyplot.scatter", "numpy.where", "numpy.number", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "numpy.issubdtype", "matplotlib.pyplot.xlabel"], "libs": ["numpy", "pandas", "matplotlib"], "doc": {"description": ["Calculate the correlation matrix of a DataFrame and plot a scatter plot for the pair of columns with the highest absolute correlation.", "Exception Handling:", "- Raises ValueError if the input DataFrame is empty.", "- Raises TypeError if any column in the DataFrame is non-numeric.", "- Raises ValueError if the DataFrame has fewer than two columns."], "note": [], "params": ["dataframe (pd.DataFrame): The DataFrame containing numeric columns for correlation calculation."], "returns": ["ax (plt.Axes): The scatter plot of the pair of columns with the highest absolute correlation."], "reqs": ["pandas", "numpy", "matplotlib"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 'A': np.random.rand(100),", "... 'B': np.random.rand(100),", "... 'C': np.random.rand(100)", "... })", ">>> ax = f_866(df)", ">>> print(ax)", "Axes(0.125,0.11;0.775x0.77)"]}} +{"task_id": "f_353", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\ndef f_353(mu=0, sigma=1):\n \"\"\"\n Draw and return a plot of a normal distribution with the given mean and standard deviation,\n utilizing numpy's linspace to create an array of 100 linearly spaced numbers between\n `mu - 3*sigma` and `mu + 3*sigma`.\n\n Parameters:\n mu (float): The mean of the distribution. 
Default is 0.\n sigma (float): The standard deviation of the distribution. Default is 1.\n\n Returns:\n matplotlib.axes.Axes: The plot representing the normal distribution.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - scipy.stats.norm\n\n Example:\n >>> ax = f_353(mu=5, sigma=2)\n >>> ax\n <Axes: >\n >>> type(ax)\n <class 'matplotlib.axes._axes.Axes'>\n \"\"\"", "canonical_solution": " x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)\n y = norm.pdf(x, mu, sigma)\n\n fig, ax = plt.subplots()\n ax.plot(x, y)\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test default parameters\n ax = f_353()\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertAlmostEqual(x[np.argmax(y)], 0, delta=0.1)\n self.assertTrue(min(x) >= -3 and max(x) <= 3)\n def test_case_2(self):\n # Test positive mu and sigma with manual calculation\n ax = f_353(mu=5, sigma=2)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n expected_min, expected_max = 5 - 3 * 2, 5 + 3 * 2\n self.assertAlmostEqual(min(x), expected_min, delta=0.1)\n self.assertAlmostEqual(max(x), expected_max, delta=0.1)\n def test_case_3(self):\n # Test negative mu and small sigma\n ax = f_353(mu=-3, sigma=0.5)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertAlmostEqual(x[np.argmax(y)], -3, delta=0.1)\n self.assertTrue(min(x) >= -3 - 1.5 and max(x) <= -3 + 1.5)\n def test_case_4(self):\n # Test large mu and sigma\n mu, sigma = 1e6, 1e5\n ax = f_353(mu=mu, sigma=sigma)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertTrue(\n len(x) > 0 and len(y) > 0,\n \"Plot data should not be empty even for large mu and sigma.\",\n )\n def test_case_5(self):\n # Test negative mu\n ax = f_353(mu=-5, sigma=4)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertAlmostEqual(x[np.argmax(y)], -5, delta=0.15)\n self.assertTrue(min(x) >= -5 - 12 and max(x) <= -5 + 12)\n def test_case_6(self):\n # Test the 
function with a sigma of 0, which might represent a degenerate distribution\n ax = f_353(mu=0, sigma=0)\n lines = ax.get_lines()\n self.assertEqual(\n len(lines),\n 1,\n \"Plot should contain exactly one line for a degenerate distribution.\",\n )\n def test_case_7(self):\n # Test the function with extremely large values of mu and sigma to ensure it doesn't break\n ax = f_353(mu=1e6, sigma=1e5)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n self.assertTrue(\n len(x) > 0 and len(y) > 0,\n \"Plot data should not be empty even for large mu and sigma.\",\n )\n def test_case_8(self):\n # Test the function with a very small positive sigma to check narrow distributions\n ax = f_353(mu=0, sigma=1e-5)\n lines = ax.get_lines()\n x, y = lines[0].get_data()\n # Checking that the plot peak is at mu and sigma affects the curve's spread.\n self.assertAlmostEqual(\n x[np.argmax(y)],\n 0,\n delta=1e-5,\n msg=\"Peak of the distribution should be at mu.\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "scipy.stats.norm.pdf", "numpy.linspace"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Draw and return a plot of a normal distribution with the given mean and standard deviation,", "utilizing numpy's linspace to create an array of 100 linearly spaced numbers between", "`mu - 3*sigma` and `mu + 3*sigma`."], "note": [], "params": ["mu (float): The mean of the distribution. Default is 0.", "sigma (float): The standard deviation of the distribution. 
Default is 1."], "returns": ["matplotlib.axes.Axes: The plot representing the normal distribution."], "reqs": ["numpy", "matplotlib.pyplot", "scipy.stats.norm"], "raises": [], "example": [">>> ax = f_353(mu=5, sigma=2)", ">>> ax", "", ">>> type(ax)", ""]}} +{"task_id": "f_851", "prompt": "import requests\nfrom bs4 import BeautifulSoup\nimport pandas as pd\nfrom io import StringIO\n\n\ndef f_851(url, table_id):\n \"\"\"\n Extracts and converts data from a specified HTML table based on the given 'table_id' on a webpage into a Pandas DataFrame.\n If the table is present but contains no data rows (i.e., no tags),\n the function returns an empty DataFrame.\n\n Parameters:\n - url (str): The URL of the webpage from which to extract the table.\n - table_id (str): The 'id' attribute of the HTML table to be extracted.\n\n Returns:\n - df (pd.DataFrame): A DataFrame containing the data extracted from the specified HTML table.\n If the table is found but has no rows ( elements), an empty DataFrame is returned.\n\n Raises:\n - requests.exceptions.HTTPError: If the HTTP request fails (e.g., due to connection issues or\n a non-successful status code like 404 or 500).\n - ValueError: If no table with the specified 'table_id' is found on the webpage. 
The error message will be\n \"Table with the specified ID not found.\"\n\n Requirements:\n - requests\n - bs4.BeautifulSoup\n - pandas\n - io\n \n Notes:\n - The function raises an HTTPError for unsuccessful HTTP requests, which includes scenarios like\n network problems or non-2xx HTTP responses.\n - A ValueError is raised specifically when the HTML table with the specified ID is not present\n in the webpage's content, indicating either an incorrect ID or the absence of the table.\n - If the located table has no rows, indicated by the absence of tags, an empty DataFrame is returned.\n This is useful for handling tables that are structurally present in the HTML but are devoid of data.\n\n Example:\n >>> f_851('https://example.com/data.html', 'table1')\n DataFrame:\n Name Age\n 0 Alice 25\n 1 Bob 30\n\n Example of ValueError:\n >>> f_851('https://example.com/data.html', 'nonexistent_table')\n ValueError: Table with the specified ID not found.\n\n Example of empty table:\n >>> f_851('https://example.com/emptytable.html', 'empty_table')\n DataFrame:\n Empty DataFrame\n Columns: []\n Index: []\n \"\"\"", "canonical_solution": " try:\n response = requests.get(url, timeout=5)\n response.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code\n except requests.exceptions.HTTPError as e:\n raise e\n\n soup = BeautifulSoup(response.text, \"html.parser\")\n table = soup.find(\"table\", {\"id\": table_id})\n\n if table is None:\n raise ValueError(\"Table with the specified ID not found.\")\n\n # Check if the table is empty (no rows)\n if not table.find_all(\"tr\"):\n return pd.DataFrame()\n\n df = pd.read_html(StringIO(str(table)))[0]\n\n return df", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_851.\"\"\"\n @patch(\"requests.get\")\n def test_successful_scrape(self, mock_get):\n \"\"\"Test a successful scrape.\"\"\"\n 
mock_html_content = \"\"\"\n \n \n \n \n \n \n
NameAge
Alice25
Bob30
\n \n \n \"\"\"\n # Mock the response\n mock_response = MagicMock()\n mock_response.text = mock_html_content\n mock_get.return_value = mock_response\n # Test\n df = f_851(\"http://example.com\", \"table0\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertGreater(len(df), 0)\n self.assertIn(\"Name\", df.columns)\n self.assertIn(\"Age\", df.columns)\n @patch(\"requests.get\")\n def test_table_not_found(self, mock_get):\n \"\"\"Test table not found.\"\"\"\n mock_html_content = \"\"\n mock_response = MagicMock()\n mock_response.text = mock_html_content\n mock_get.return_value = mock_response\n # Test\n with self.assertRaises(ValueError):\n f_851(\"http://example.com\", \"non_existent_table\")\n @patch(\"requests.get\")\n def test_network_error(self, mock_get):\n \"\"\"Test network error.\"\"\"\n mock_get.side_effect = requests.exceptions.ConnectionError\n with self.assertRaises(requests.exceptions.ConnectionError):\n f_851(\"http://example.com\", \"table0\")\n @patch(\"requests.get\")\n def test_http_error(self, mock_get):\n \"\"\"Test HTTP error.\"\"\"\n mock_get.return_value.raise_for_status.side_effect = (\n requests.exceptions.HTTPError\n )\n # Test\n with self.assertRaises(requests.exceptions.HTTPError):\n f_851(\"http://example.com\", \"table0\")\n @patch(\"requests.get\")\n def test_empty_table(self, mock_get):\n # Mock HTML content with an empty table\n mock_html_content = \"\"\"\n \n \n
\n \n \n \"\"\"\n # Mock the response\n mock_response = MagicMock()\n mock_response.text = mock_html_content\n mock_get.return_value = mock_response\n # Test\n df = f_851(\"http://example.com\", \"table0\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(len(df), 0)", "apis": ["pandas.read_html", "bs4.BeautifulSoup", "io.StringIO", "requests.exceptions", "pandas.DataFrame", "requests.get"], "libs": ["bs4", "pandas", "io", "requests"], "doc": {"description": ["Extracts and converts data from a specified HTML table based on the given 'table_id' on a webpage into a Pandas DataFrame.", "If the table is present but contains no data rows (i.e., no tags),", "the function returns an empty DataFrame.", "Notes:", "- The function raises an HTTPError for unsuccessful HTTP requests, which includes scenarios like", "network problems or non-2xx HTTP responses.", "- A ValueError is raised specifically when the HTML table with the specified ID is not present", "in the webpage's content, indicating either an incorrect ID or the absence of the table.", "- If the located table has no rows, indicated by the absence of tags, an empty DataFrame is returned.", "This is useful for handling tables that are structurally present in the HTML but are devoid of data.", "Example of ValueError:", ">>> f_851('https://example.com/data.html', 'nonexistent_table')", "ValueError: Table with the specified ID not found.", "Example of empty table:", ">>> f_851('https://example.com/emptytable.html', 'empty_table')", "DataFrame:", "Empty DataFrame", "Columns: []", "Index: []"], "note": [], "params": ["url (str): The URL of the webpage from which to extract the table.", "table_id (str): The 'id' attribute of the HTML table to be extracted."], "returns": ["df (pd.DataFrame): A DataFrame containing the data extracted from the specified HTML table.", "If the table is found but has no rows ( elements), an empty DataFrame is returned."], "reqs": ["requests", "bs4.BeautifulSoup", "pandas", "io"], 
"raises": ["requests.exceptions.HTTPError: If the HTTP request fails (e.g., due to connection issues or", "a non-successful status code like 404 or 500).", "ValueError: If no table with the specified 'table_id' is found on the webpage. The error message will be", "\"Table with the specified ID not found.\""], "example": [">>> f_851('https://example.com/data.html', 'table1')", "DataFrame:", "Name Age", "0 Alice 25", "1 Bob 30"]}} +{"task_id": "f_399", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_399(column, data):\n \"\"\"\n Analyze a list of employee data and calculate statistics for a given column. If the data list is empty,\n the sum will be 0 and mean, min, and max values will be NaN. The function also visualizes the data with\n a pie chart, using the Age column as labels.\n\n Parameters:\n column (str): The column to analyze. Valid values are 'Age', 'Salary', and 'Experience'.\n If invalid, the function will raise KeyError.\n data (list of lists): The employee data, where each list represents [Age, Salary, Experience].\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the sum, mean, min, and max of the column.\n - Axes object: The pie chart visualizing the column data.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> data = [[25, 50000, 2], [30, 75000, 5], [35, 100000, 7], [40, 125000, 10], [45, 150000, 12]]\n >>> stats, ax = f_399('Salary', data)\n >>> stats\n {'sum': 500000, 'mean': 100000.0, 'min': 50000, 'max': 150000}\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " # Constants encapsulated within the function\n COLUMNS = [\"Age\", \"Salary\", \"Experience\"]\n\n df = pd.DataFrame(data, columns=COLUMNS)\n column_data = df[column]\n\n # Handle empty data\n if df.empty:\n result = {\"sum\": 0, \"mean\": np.nan, \"min\": np.nan, \"max\": np.nan}\n else:\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.mean(column_data),\n \"min\": 
np.min(column_data),\n \"max\": np.max(column_data),\n }\n\n fig, ax = plt.subplots()\n ax.pie(column_data, labels=df[\"Age\"], autopct=\"%1.1f%%\")\n ax.set_title(f\"Pie Chart of {column}\")\n\n return result, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Tests the 'Salary' column with normal data\n data = [\n [25, 50000, 2],\n [30, 75000, 5],\n [35, 100000, 7],\n [40, 125000, 10],\n [45, 150000, 12],\n ]\n stats, ax = f_399(\"Salary\", data)\n self.assertEqual(\n stats, {\"sum\": 500000, \"mean\": 100000.0, \"min\": 50000, \"max\": 150000}\n )\n def test_case_2(self):\n # Tests the 'Experience' column\n data = [\n [26, 52000, 3],\n [31, 76000, 6],\n [36, 101000, 8],\n [41, 126000, 11],\n [46, 151000, 13],\n ]\n stats, ax = f_399(\"Experience\", data)\n self.assertEqual(stats, {\"sum\": 41, \"mean\": 8.2, \"min\": 3, \"max\": 13})\n def test_case_3(self):\n # Tests the 'Age' column\n data = [\n [27, 53000, 4],\n [32, 77000, 7],\n [37, 102000, 9],\n [42, 127000, 12],\n [47, 152000, 14],\n ]\n stats, ax = f_399(\"Age\", data)\n self.assertEqual(stats, {\"sum\": 185, \"mean\": 37.0, \"min\": 27, \"max\": 47})\n def test_case_4(self):\n # Test edge case when data is empty\n data = []\n stats, ax = f_399(\"Salary\", data)\n self.assertEqual(\n stats, {\"sum\": 0, \"mean\": np.nan, \"min\": np.nan, \"max\": np.nan}\n )\n def test_case_5(self):\n # Tests with a single data entry\n data = [[30, 75000, 5]]\n stats, ax = f_399(\"Age\", data)\n self.assertEqual(stats, {\"sum\": 30, \"mean\": 30.0, \"min\": 30, \"max\": 30})\n self.assertTrue(\n isinstance(ax, plt.Axes),\n \"The plotting object is not an instance of matplotlib.axes._axes.Axes\",\n )\n def test_case_6(self):\n # Tests handling of an invalid column name\n data = [[25, 50000, 2], [30, 75000, 5]]\n with self.assertRaises(KeyError):\n f_399(\"InvalidColumn\", data)\n def test_case_7(self):\n # Tests that the pie 
chart is correctly generated for given data\n data = [\n [25, 50000, 2],\n [30, 75000, 5],\n [35, 100000, 7],\n [40, 125000, 10],\n [45, 150000, 12],\n ]\n _, ax = f_399(\"Salary\", data)\n # Verify the number of pie slices matches the number of data points\n self.assertEqual(\n len(ax.patches),\n len(data),\n \"The number of pie slices does not match the number of data points.\",\n )\n # Optionally, check for the presence of labels (Ages)\n labels = [str(age) for age, _, _ in data] # Extracting age labels from data\n plot_labels = [text.get_text() for text in ax.texts]\n self.assertTrue(\n all(label in plot_labels for label in labels),\n \"Not all expected labels are present in the plot.\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.sum", "pandas.DataFrame", "numpy.mean", "numpy.min", "numpy.nan", "matplotlib.pyplot.subplots", "numpy.max"], "libs": ["pandas", "numpy", "matplotlib"], "doc": {"description": ["Analyze a list of employee data and calculate statistics for a given column. If the data list is empty,", "the sum will be 0 and mean, min, and max values will be NaN. The function also visualizes the data with", "a pie chart, using the Age column as labels."], "note": [], "params": ["column (str): The column to analyze. 
Valid values are 'Age', 'Salary', and 'Experience'.", "If invalid, the function will raise KeyError.", "data (list of lists): The employee data, where each list represents [Age, Salary, Experience]."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the sum, mean, min, and max of the column.", "Axes object: The pie chart visualizing the column data."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [[25, 50000, 2], [30, 75000, 5], [35, 100000, 7], [40, 125000, 10], [45, 150000, 12]]", ">>> stats, ax = f_399('Salary', data)", ">>> stats", "{'sum': 500000, 'mean': 100000.0, 'min': 50000, 'max': 150000}", ">>> type(ax)", ""]}} +{"task_id": "f_612", "prompt": "import os\nimport shutil\nimport glob\n\ndef f_612(source_dir, dest_dir, extension):\n \"\"\"\n Move all files with a particular extension from one directory to another.\n \n Parameters:\n - source_dir (str): The source directory.\n - dest_dir (str): The destination directory.\n - extension (str): The file extension.\n\n Returns:\n - result (int): The count of files that were moved. 
\n\n Requirements:\n - os\n - shutil\n - glob\n \n Example:\n >>> f_612('path_to_source_dir', 'path_to_dest_dir', '.txt')\n 10\n \"\"\"", "canonical_solution": " files = glob.glob(os.path.join(source_dir, f'*.{extension}'))\n \n for file in files:\n shutil.move(file, dest_dir)\n \n result = len(files)\n\n return result", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n for d in ['./source', './destination', './src', './dst', './s', './d']:\n if os.path.exists(d):\n shutil.rmtree(d)\n def test_case_1(self):\n # Create source directory\n if os.path.exists('./source'):\n shutil.rmtree('./source')\n os.mkdir('./source')\n # Create destination directory\n if os.path.exists('./destination'):\n shutil.rmtree('./destination')\n os.mkdir('./destination')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./source', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./source', './destination', 'txt')\n # Check files\n for d in ['./destination', './source']:\n if d == './source':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./source')\n shutil.rmtree('./destination')\n def test_case_2(self):\n # Create source directory\n if os.path.exists('./src'):\n shutil.rmtree('./src')\n os.mkdir('./src')\n # Create destination directory\n if os.path.exists('./dst'):\n shutil.rmtree('./dst')\n os.mkdir('./dst')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 
'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./src', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./src', './dst', 'txt')\n # Check files\n for d in ['./dst', './src']:\n if d == './src':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./src')\n shutil.rmtree('./dst')\n def test_case_3(self):\n # Create source directory\n if os.path.exists('./s'):\n shutil.rmtree('./s')\n os.mkdir('./s')\n # Create destination directory\n if os.path.exists('./d'):\n shutil.rmtree('./d')\n os.mkdir('./d')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./s', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./s', './d', 'txt')\n # Check files\n for d in ['./d', './s']:\n if d == './s':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./s')\n shutil.rmtree('./d')\n def test_case_4(self):\n # Create source directory\n if os.path.exists('./s'):\n shutil.rmtree('./s')\n os.mkdir('./s')\n # Create destination directory\n if 
os.path.exists('./destination'):\n shutil.rmtree('./destination')\n os.mkdir('./destination')\n # Create files\n for filename in ['bbb.txt', 'a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./s', filename), 'w') as f:\n f.write('test')\n # Run function\n f_612('./s', './destination', 'txt')\n # Check files\n for d in ['./destination', './s']:\n if d == './s':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx'))) \n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n # Remove files\n shutil.rmtree('./s')\n shutil.rmtree('./destination')\n def test_case_5(self):\n # Create source directory\n if os.path.exists('./source'):\n shutil.rmtree('./source')\n os.mkdir('./source')\n # Create destination directory\n if os.path.exists('./d'):\n shutil.rmtree('./d')\n os.mkdir('./d')\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join('./source', filename), 'w') as f:\n f.write('xxx')\n # Run function\n f_612('./source', './d', 'docx')\n # Check files\n for d in ['./d', './source']:\n if d == './source':\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n else:\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 
'd.docx')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.doc')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))", "apis": ["os.path", "glob.glob", "os.path.join", "shutil.move"], "libs": ["os", "glob", "shutil"], "doc": {"description": ["Move all files with a particular extension from one directory to another."], "note": [], "params": ["source_dir (str): The source directory.", "dest_dir (str): The destination directory.", "extension (str): The file extension."], "returns": ["result (int): The count of files that were moved."], "reqs": ["os", "shutil", "glob"], "raises": [], "example": [">>> f_612('path_to_source_dir', 'path_to_dest_dir', '.txt')", "10"]}} +{"task_id": "f_918", "prompt": "import pytz\nfrom dateutil.parser import parse\n\n# Constants\nTIME_FORMAT = \"%d/%m/%y %H:%M:%S.%f\"\n\n\ndef f_918(time_string, from_tz, to_tz):\n \"\"\"\n Converts a time string from one timezone to another, considering various cases such as daylight saving time.\n\n Parameters:\n - time_string (str): A time string in the format 'dd/mm/yy HH:MM:SS.fff'. This string should represent a valid date and time.\n - from_tz (str): The timezone of the given time string. The timezone should be a valid IANA timezone name (e.g., 'UTC', 'America/New_York').\n - to_tz (str): The target timezone to which the time string should be converted. This should also be a valid IANA timezone name (e.g., 'Asia/Tokyo').\n\n Returns:\n - str: The converted time string in the format 'dd/mm/yy HH:MM:SS.fff'. 
The conversion takes into account any differences in daylight saving rules between the source and target timezones.\n\n Requirements:\n - pytz\n - dateutil\n\n Example:\n >>> f_918('30/03/09 16:31:32.123', 'UTC', 'America/New_York')\n '30/03/09 12:31:32.123000'\n\n Note: The example assumes no daylight saving time shift between the given timezones at the specified date and time.\n \"\"\"", "canonical_solution": " from_zone = pytz.timezone(from_tz)\n to_zone = pytz.timezone(to_tz)\n dt = parse(time_string, dayfirst=True)\n dt = from_zone.localize(dt)\n dt = dt.astimezone(to_zone)\n\n return dt.strftime(TIME_FORMAT)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_918\"\"\"\n def test_utc_to_est(self):\n \"\"\"\n Test conversion from UTC to Eastern Standard Time.\n \"\"\"\n result = f_918(\"30/03/09 16:31:32.123\", \"UTC\", \"America/New_York\")\n expected = \"30/03/09 12:31:32.123000\" # Adjusted for daylight saving time if applicable\n self.assertEqual(result, expected)\n def test_est_to_utc(self):\n \"\"\"\n Test conversion from Eastern Standard Time to UTC.\n \"\"\"\n result = f_918(\"30/03/09 12:31:32.123\", \"America/New_York\", \"UTC\")\n expected = \"30/03/09 16:31:32.123000\" # Adjusted for daylight saving time if applicable\n self.assertEqual(result, expected)\n def test_utc_to_ist(self):\n \"\"\"\n Test conversion from UTC to Indian Standard Time.\n \"\"\"\n result = f_918(\"01/04/09 00:00:00.000\", \"UTC\", \"Asia/Kolkata\")\n expected = \"01/04/09 05:30:00.000000\" # IST is UTC+5:30\n self.assertEqual(result, expected)\n def test_ist_to_utc(self):\n \"\"\"\n Test conversion from Indian Standard Time to UTC.\n \"\"\"\n result = f_918(\"01/04/09 05:30:00.000\", \"Asia/Kolkata\", \"UTC\")\n expected = \"01/04/09 00:00:00.000000\" # IST is UTC+5:30\n self.assertEqual(result, expected)\n def test_utc_to_gmt(self):\n \"\"\"\n Test conversion from UTC to GMT (should be the same).\n \"\"\"\n result = f_918(\"15/04/09 
10:30:00.000\", \"UTC\", \"GMT\")\n expected = \"15/04/09 10:30:00.000000\" # GMT and UTC are the same\n self.assertEqual(result, expected)", "apis": ["pytz.timezone", "dateutil.parser.parse"], "libs": ["dateutil", "pytz"], "doc": {"description": ["Converts a time string from one timezone to another, considering various cases such as daylight saving time."], "note": ["The example assumes no daylight saving time shift between the given timezones at the specified date and time."], "params": ["time_string (str): A time string in the format 'dd/mm/yy HH:MM:SS.fff'. This string should represent a valid date and time.", "from_tz (str): The timezone of the given time string. The timezone should be a valid IANA timezone name (e.g., 'UTC', 'America/New_York').", "to_tz (str): The target timezone to which the time string should be converted. This should also be a valid IANA timezone name (e.g., 'Asia/Tokyo')."], "returns": ["str: The converted time string in the format 'dd/mm/yy HH:MM:SS.fff'. The conversion takes into account any differences in daylight saving rules between the source and target timezones."], "reqs": ["pytz", "dateutil"], "raises": [], "example": [">>> f_918('30/03/09 16:31:32.123', 'UTC', 'America/New_York')", "'30/03/09 12:31:32.123000'"]}} +{"task_id": "f_777", "prompt": "import pandas as pd\nimport string\n\ndef f_777(word):\n \"\"\"\n Creates a Pandas DataFrame from a single word, where each row contains a letter from the word \n and its 1-based position in the alphabet.\n\n Requirements:\n - pandas\n - string\n \n Parameters:\n - word (str): The word to create the DataFrame from. 
The word should be in lowercase and consist of alphabetic characters only.\n \n Returns:\n - pandas.DataFrame: A DataFrame with two columns: 'Letter' and 'Position', \n where 'Position' is the letter's position in the English alphabet.\n \n Examples:\n >>> f_777('abc')\n Letter Position\n 0 a 1\n 1 b 2\n 2 c 3\n\n >>> f_777('zoo')\n Letter Position\n 0 z 26\n 1 o 15\n 2 o 15\n \n Raises:\n - ValueError: If the input word is not in lowercase or contains non-alphabetic characters.\n \"\"\"", "canonical_solution": " if not word: # Check if the input word is empty and return an empty DataFrame\n return pd.DataFrame({'Letter': [], 'Position': []})\n elif not word.isalpha() or not word.islower():\n raise ValueError(\"Input word must be in lowercase alphabetic characters only.\")\n\n alphabet = string.ascii_lowercase\n positions = [alphabet.index(char) + 1 for char in word]\n df = pd.DataFrame({'Letter': list(word), 'Position': positions})\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_abc(self):\n \"\"\"Test with the word 'abc'.\"\"\"\n result = f_777('abc')\n expected = pd.DataFrame({'Letter': ['a', 'b', 'c'], 'Position': [1, 2, 3]})\n pd.testing.assert_frame_equal(result, expected)\n def test_xyz(self):\n \"\"\"Test with the word 'xyz'.\"\"\"\n result = f_777('xyz')\n expected = pd.DataFrame({'Letter': ['x', 'y', 'z'], 'Position': [24, 25, 26]})\n pd.testing.assert_frame_equal(result, expected)\n def test_mixed_case_error(self):\n \"\"\"Test with a mixed case word, expecting a ValueError.\"\"\"\n with self.assertRaises(ValueError):\n f_777('AbC')\n def test_non_alpha_error(self):\n \"\"\"Test with a non-alphabetic word, expecting a ValueError.\"\"\"\n with self.assertRaises(ValueError):\n f_777('123')\n def test_empty_string(self):\n \"\"\"Test with an empty string, expecting an empty DataFrame.\"\"\"\n result = f_777('')\n expected = pd.DataFrame({'Letter': [], 'Position': []})\n 
pd.testing.assert_frame_equal(result, expected)", "apis": ["pandas.DataFrame", "string.ascii_lowercase"], "libs": ["pandas", "string"], "doc": {"description": ["Creates a Pandas DataFrame from a single word, where each row contains a letter from the word", "and its 1-based position in the alphabet.", ">>> f_777('zoo')", "Letter Position", "0 z 26", "1 o 15", "2 o 15"], "note": [], "params": ["word (str): The word to create the DataFrame from. The word should be in lowercase and consist of alphabetic characters only."], "returns": ["pandas.DataFrame: A DataFrame with two columns: 'Letter' and 'Position',", "where 'Position' is the letter's position in the English alphabet."], "reqs": ["pandas", "string"], "raises": ["ValueError: If the input word is not in lowercase or contains non-alphabetic characters."], "example": ["Examples:", ">>> f_777('abc')", "Letter Position", "0 a 1", "1 b 2", "2 c 3"]}} +{"task_id": "f_337", "prompt": "import numpy as np\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_337(df1, df2, column1=\"feature1\", column2=\"feature2\"):\n \"\"\"Merge datasets, perform KMeans clustering, then return cluster labels and scatterplot.\n\n Each dataset is assumed to contain at least one id column and one feature column. The column to process\n is specified for df1 and df2 via column1 and column2, respectively. KMeans clustering is applied\n with k=2 and n_init=10. Resulting scatterplot shows column1 on the x-axis, column2 on the y-axis,\n and predicted cluster as color.\n\n Parameters:\n - df1 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column1.\n - df2 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column2.\n - column1 (str): Name of column containing features to model in df1. Defaults to \"feature1\".\n - column2 (str): Name of column containing features to model in df2. 
Defaults to \"feature2\".\n\n Returns:\n - labels (np.ndarray): Cluster labels for each data point (dtype=int32).\n - ax (matplotlib.axes._axes.Axes): The plotted figure's Axes object.\n\n Requirements:\n - sklearn.cluster.KMeans\n - matplotlib.pyplot\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': [2.3, 4.5, 6.7]})\n >>> labels, ax = f_337(df1, df2)\n >>> type(labels)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " df = pd.merge(df1, df2, on=\"id\")\n X = df[[column1, column2]]\n\n kmeans = KMeans(n_clusters=2, n_init=10)\n kmeans.fit(X)\n labels = kmeans.labels_\n\n _, ax = plt.subplots()\n ax.scatter(X[column1], X[column2], c=kmeans.labels_)\n ax.set_xlabel(column1)\n ax.set_ylabel(column2)\n\n return labels, ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Sample dataframes for testing\n self.df1_base = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature1\": [1.2, 3.4, 5.6, 7.8, 9.0]}\n )\n self.df2_base = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature2\": [2.3, 4.5, 6.7, 8.9, 10.1]}\n )\n def tearDown(self):\n plt.close(\"all\")\n def test_case_1(self):\n # Test scatterplot\n _, ax = f_337(self.df1_base, self.df2_base)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertEqual(ax.get_xlabel(), \"feature1\")\n self.assertEqual(ax.get_ylabel(), \"feature2\")\n def test_case_2(self):\n # Expect 2 clusters\n labels, _ = f_337(self.df1_base, self.df2_base)\n self.assertEqual(len(labels), 5)\n self.assertEqual(len(np.unique(labels)), 2)\n def test_case_3(self):\n # Mixed valid data types\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [1, 2, 3]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [1.1, 2.2, 3.3]})\n labels, _ = f_337(df1, df2)\n self.assertEqual(len(labels), 3)\n def test_case_4(self):\n # Partial matches\n df1 = 
pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [1.2, 3.4, 5.6]})\n df2 = pd.DataFrame({\"id\": [1, 2, 6], \"feature2\": [1.2, 3.1, 6.7]})\n labels, _ = f_337(df1, df2)\n self.assertEqual(len(labels), 2)\n self.assertEqual(len(np.unique(labels)), 2)\n def test_case_5(self):\n # Should fail when there's no matching id\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [1.2, 3.4, 5.6]})\n df2 = pd.DataFrame({\"id\": [4, 5, 6], \"feature2\": [2.3, 4.5, 6.7]})\n with self.assertRaises(ValueError):\n f_337(df1, df2)\n def test_case_6(self):\n # Should fail on non-numeric columns\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [\"a\", \"b\", \"c\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [1.1, 2.2, 3.3]})\n with self.assertRaises(Exception):\n f_337(df1, df2)\n def test_case_7(self):\n # Should fail on missing value\n df1 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature1\": [1.2, np.nan, 5.6, 7.8, 9.0]}\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5], \"feature2\": [2.3, 4.5, np.nan, 8.9, 10.1]}\n )\n with self.assertRaises(ValueError):\n f_337(df1, df2)", "apis": ["matplotlib.pyplot.subplots", "sklearn.cluster.KMeans"], "libs": ["sklearn", "matplotlib"], "doc": {"description": ["Merge datasets, perform KMeans clustering, then return cluster labels and scatterplot.", "Each dataset is assumed to contain at least one id column and one feature column. The column to process", "is specified for df1 and df2 via column1 and column2, respectively. KMeans clustering is applied", "with k=2 and n_init=10. Resulting scatterplot shows column1 on the x-axis, column2 on the y-axis,", "and predicted cluster as color."], "note": [], "params": ["df1 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column1.", "df2 (pd.DataFrame): Dataframe with columns 'id' and feature columns including column2.", "column1 (str): Name of column containing features to model in df1. 
Defaults to \"feature1\".", "column2 (str): Name of column containing features to model in df2. Defaults to \"feature2\"."], "returns": ["labels (np.ndarray): Cluster labels for each data point (dtype=int32).", "ax (matplotlib.axes._axes.Axes): The plotted figure's Axes object."], "reqs": ["sklearn.cluster.KMeans", "matplotlib.pyplot"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': [2.3, 4.5, 6.7]})", ">>> labels, ax = f_337(df1, df2)", ">>> type(labels)", "", ">>> type(ax)", ""]}} +{"task_id": "f_803", "prompt": "import string\nimport random\n\n\ndef f_803(text, seed=None):\n \"\"\"\n Generates a password that mirrors the structure of the given text by replacing alphabetic\n characters with random ascii lowercase letters, digits with random single-digit numbers,\n spaces wth either a random digit or random lowercase letter at equal probabilities, and\n leaving other characters unchanged.\n\n Parameters:\n - text (str): The text to be mirrored in the generated password. Must not be empty.\n - seed (int, optional): Seed for the random number generator. Defaults to None (not set).\n\n Returns:\n - str: The generated password.\n\n Raises:\n - ValueError: If the input text is empty.\n\n Requirements:\n - random\n - string\n\n Note:\n - This function does not handle high Unicode characters and focuses only on ASCII values.\n\n Examples:\n >>> f_803(\"hello world! 
123\", 0)\n 'mbqmp3jytre!v553'\n >>> f_803(\"apple321#\", seed=42)\n 'uahev901#'\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n if not text:\n raise ValueError(\"text cannot be empty.\")\n password = \"\"\n for char in text:\n random_lowercase = random.choice(string.ascii_lowercase)\n random_digit = random.choice(string.digits)\n if char.isalpha():\n password += random_lowercase\n elif char.isdigit():\n password += random_digit\n elif char == \" \":\n if random.random() < 0.5:\n password += random_lowercase\n else:\n password += random_digit\n else:\n password += char\n return password", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n result = f_803(\"Hello123\", seed=1)\n self.assertEqual(len(result), 8)\n for i, char in enumerate(\"Hello123\"):\n if char.isalpha():\n self.assertTrue(result[i].isalpha())\n elif char.isdigit():\n self.assertTrue(result[i].isdigit())\n def test_case_2(self):\n # Test basic case with alphabet only\n result = f_803(\"ABC\", seed=2)\n self.assertEqual(len(result), 3)\n self.assertTrue(all(char.isalpha() for char in result))\n def test_case_3(self):\n # Test basic case with digit only\n result = f_803(\"123\", seed=3)\n self.assertEqual(len(result), 3)\n self.assertTrue(all(char.isdigit() for char in result))\n def test_case_4(self):\n # Test basic case with whitespace, alphabet, number, special char\n text = \"Hello, world!\"\n result = f_803(text, seed=4)\n self.assertEqual(len(result), 13)\n for i, char in enumerate(text):\n result_char = result[i]\n if char.isalpha():\n self.assertTrue(result_char.isalpha())\n elif char.isdigit():\n self.assertTrue(result_char.isdigit())\n elif char == \" \":\n self.assertTrue(result_char.isalnum())\n else:\n self.assertEqual(result[i], char)\n def test_case_5(self):\n # Test handling empty string\n with self.assertRaises(Exception):\n f_803(\"\", seed=5)", "apis": ["random.random", "string.ascii_lowercase", 
"random.seed", "random.choice", "string.digits"], "libs": ["random", "string"], "doc": {"description": ["Generates a password that mirrors the structure of the given text by replacing alphabetic", "characters with random ascii lowercase letters, digits with random single-digit numbers,", "spaces wth either a random digit or random lowercase letter at equal probabilities, and", "leaving other characters unchanged."], "note": ["This function does not handle high Unicode characters and focuses only on ASCII values."], "params": ["text (str): The text to be mirrored in the generated password. Must not be empty.", "seed (int, optional): Seed for the random number generator. Defaults to None (not set)."], "returns": ["str: The generated password."], "reqs": ["random", "string"], "raises": ["ValueError: If the input text is empty."], "example": ["Examples:", ">>> f_803(\"hello world! 123\", 0)", "'mbqmp3jytre!v553'", ">>> f_803(\"apple321#\", seed=42)", "'uahev901#'"]}} +{"task_id": "f_889", "prompt": "from datetime import datetime\nimport numpy as np\nfrom dateutil.parser import parse\n\nLEAP_SECONDS = np.array(\n [\n 1972,\n 1973,\n 1974,\n 1975,\n 1976,\n 1977,\n 1978,\n 1979,\n 1980,\n 1981,\n 1982,\n 1983,\n 1985,\n 1988,\n 1990,\n 1993,\n 1994,\n 1997,\n 1999,\n 2006,\n 2009,\n 2012,\n 2015,\n 2016,\n 2020,\n ]\n)\n\n\ndef f_889(date_str):\n \"\"\"\n Calculate the total number of seconds elapsed from a given date until the current time,\n including any leap seconds that occurred in this period.\n\n Parameters:\n date_str (str): The date and time from which to calculate, in \"yyyy-mm-dd hh:mm:ss\" format.\n\n Returns:\n int: The total number of elapsed seconds, including leap seconds, since the given date.\n\n Requirements:\n - datetime.datetime\n - numpy\n - dateutil.parser.parse\n \n Note:\n This function uses the datetime, numpy, and dateutil.parser modules.\n The LEAP_SECONDS array should contain years when leap seconds were added.\n\n Example:\n >>> 
total_seconds = f_889('1970-01-01 00:00:00')\n >>> print(total_seconds)\n 1702597276\n \"\"\"", "canonical_solution": " given_date = parse(date_str)\n current_date = datetime.now()\n\n total_seconds = (current_date - given_date).total_seconds()\n\n # Count leap seconds that occurred between the two dates\n leap_seconds = np.sum(LEAP_SECONDS >= given_date.year)\n\n total_seconds += leap_seconds\n\n return int(total_seconds)", "test": "import unittest\nfrom datetime import datetime, timedelta\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_889.\"\"\"\n def test_recent_date(self):\n \"\"\"\n Test the function with a recent date.\n \"\"\"\n test_date = \"2022-01-01 00:00:00\"\n expected_result = (datetime.now() - datetime(2022, 1, 1)).total_seconds()\n expected_result += np.sum(LEAP_SECONDS >= 2022)\n self.assertEqual(f_889(test_date), int(expected_result))\n def test_date_before_leap_seconds(self):\n \"\"\"\n Test the function with a date before the introduction of leap seconds.\n \"\"\"\n test_date = \"1960-01-01 00:00:00\"\n expected_result = (datetime.now() - datetime(1960, 1, 1)).total_seconds()\n expected_result += np.sum(LEAP_SECONDS >= 1960)\n self.assertEqual(f_889(test_date), int(expected_result))\n def test_date_with_leap_second(self):\n \"\"\"\n Test the function with a date in a year when a leap second was added.\n \"\"\"\n test_date = \"2016-01-01 00:00:00\"\n expected_result = (datetime.now() - datetime(2016, 1, 1)).total_seconds()\n expected_result += np.sum(LEAP_SECONDS >= 2016)\n self.assertAlmostEqual(f_889(test_date), int(expected_result), delta=1)\n def test_future_date(self):\n \"\"\"\n Test the function with a future date.\n \"\"\"\n future_date = datetime.now() + timedelta(days=30)\n future_date_str = future_date.strftime(\"%Y-%m-%d %H:%M:%S\")\n result = f_889(future_date_str)\n expected_result = -30 * 24 * 3600 # Negative seconds for future dates\n # Allowing a margin of error of 1 second\n 
self.assertTrue(abs(result - expected_result) <= 1)\n def test_current_date(self):\n \"\"\"\n Test the function with the current date and time.\n \"\"\"\n current_date_str = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n self.assertEqual(f_889(current_date_str), 0)", "apis": ["numpy.sum", "datetime.datetime.now", "dateutil.parser.parse", "numpy.array"], "libs": ["numpy", "dateutil", "datetime"], "doc": {"description": ["Calculate the total number of seconds elapsed from a given date until the current time,", "including any leap seconds that occurred in this period."], "note": ["This function uses the datetime, numpy, and dateutil.parser modules.", "The LEAP_SECONDS array should contain years when leap seconds were added."], "params": ["date_str (str): The date and time from which to calculate, in \"yyyy-mm-dd hh:mm:ss\" format."], "returns": ["int: The total number of elapsed seconds, including leap seconds, since the given date."], "reqs": ["datetime.datetime", "numpy", "dateutil.parser.parse"], "raises": [], "example": [">>> total_seconds = f_889('1970-01-01 00:00:00')", ">>> print(total_seconds)", "1702597276"]}} +{"task_id": "f_830", "prompt": "import json\nimport pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\nimport matplotlib.pyplot as plt\n\n\ndef f_830(json_data: str, data_key: str):\n \"\"\"\n Processes a JSON string to extract numerical data, Min-Max normalize them,\n and generate a line plot.\n\n Parameters:\n - json_data (str): JSON formatted string containing the data.\n - data_key (str): Dot-separated full key path to access the numerical data within the JSON structure.\n\n Returns:\n - Tuple:\n - pd.Series: Original dataset in float64.\n - pd.Series or None: Dataset after Min-Max scaling in float64, or None if data is empty.\n - plt.Axes or None: Line plot of normalized data, or None if data is empty.\n\n Raises:\n - KeyError: if key path is not found in the given data.\n\n Requirements:\n - json\n - pandas\n - sklearn\n - 
matplotlib\n\n Notes:\n - The line plot includes labeled axes and a legend. It visualizes the original\n data with label \"Original Data\" and normalized ones as \"Normalized Data\".\n The function sets the plot title to \"Comparison of Original and Normalized Data\",\n with \"Index\" on the x-axis and \"Value\" on the y-axis.\n\n Example:\n >>> json_str = '{\"data\": {\"values\": [5, 10, 15, 20, 25]}}'\n >>> original_data, normalized_data, ax = f_830(json_str, 'data.values')\n >>> type(original_data), type(normalized_data), type(ax)\n (, , )\n \"\"\"", "canonical_solution": " data = json.loads(json_data)\n try:\n data = json.loads(json_data)\n for key in data_key.split(\".\"):\n data = data[key]\n values = pd.Series(data, dtype=pd.Float64Dtype)\n except KeyError:\n raise KeyError(f\"Key path '{data_key}' not found in the provided JSON data.\")\n\n if values.empty:\n return values, None, None\n\n scaler = MinMaxScaler()\n normalized_values = pd.Series(\n scaler.fit_transform(values.values.reshape(-1, 1)).flatten(),\n dtype=pd.Float64Dtype,\n )\n\n fig, ax = plt.subplots()\n ax.plot(values, label=\"Original Data\")\n ax.plot(normalized_values, label=\"Normalized Data\")\n ax.set_title(\"Comparison of Original and Normalized Data\")\n ax.set_xlabel(\"Index\")\n ax.set_ylabel(\"Value\")\n ax.legend()\n\n return values, normalized_values, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_data_extraction(self):\n json_str = '{\"data\": {\"values\": [0.5, 10, 15, 20]}}'\n data_key = \"data.values\"\n original_data, _, _ = f_830(json_str, data_key)\n expected_series = pd.Series([0.5, 10, 15, 20], dtype=pd.Float64Dtype)\n pd.testing.assert_series_equal(original_data, expected_series)\n def test_data_normalization(self):\n json_str = '{\"data\": {\"values\": [0, 10, 20, 30, 40]}}'\n data_key = \"data.values\"\n _, normalized_data, _ = f_830(json_str, data_key)\n expected_normalized = 
pd.Series(\n [0.0, 0.25, 0.5, 0.75, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized)\n def test_plot_properties(self):\n json_str = '{\"data\": {\"values\": [1, 2, 3, 4, 5]}}'\n data_key = \"data.values\"\n _, _, ax = f_830(json_str, data_key)\n self.assertEqual(ax.get_title(), \"Comparison of Original and Normalized Data\")\n self.assertEqual(ax.get_xlabel(), \"Index\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n legend_texts = [text.get_text() for text in ax.get_legend().get_texts()]\n self.assertIn(\"Original Data\", legend_texts)\n self.assertIn(\"Normalized Data\", legend_texts)\n def test_empty_data(self):\n json_str = '{\"data\": {\"values\": []}}'\n data_key = \"data.values\"\n original_data, normalized_data, ax = f_830(json_str, data_key)\n self.assertTrue(original_data.empty)\n self.assertIsNone(normalized_data)\n self.assertIsNone(ax)\n def test_non_uniform_data_spacing(self):\n json_str = '{\"data\": {\"values\": [1, 1, 2, 3, 5, 8]}}'\n data_key = \"data.values\"\n _, normalized_data, _ = f_830(json_str, data_key)\n expected_normalized = pd.Series(\n [0.0, 0.0, 0.142857, 0.285714, 0.571429, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized, atol=1e-6)\n def test_negative_values(self):\n json_str = '{\"data\": {\"values\": [-50, -20, 0, 20, 50]}}'\n data_key = \"data.values\"\n _, normalized_data, _ = f_830(json_str, data_key)\n expected_normalized = pd.Series(\n [0.0, 0.3, 0.5, 0.7, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized, atol=1e-5)\n def test_nested_json_structure(self):\n json_str = '{\"data\": {\"deep\": {\"deeper\": {\"values\": [2, 4, 6, 8, 10]}}}}'\n data_key = \"data.deep.deeper.values\"\n original_data, _, _ = f_830(json_str, data_key)\n expected_series = pd.Series([2, 4, 6, 8, 10], dtype=pd.Float64Dtype)\n pd.testing.assert_series_equal(original_data, expected_series)\n def 
test_complex_json_structure(self):\n json_str = \"\"\"\n {\n \"metadata\": {\n \"source\": \"sensor_array\",\n \"timestamp\": \"2023-04-11\"\n },\n \"readings\": {\n \"temperature\": [20, 22, 21, 23, 24],\n \"humidity\": [30, 32, 31, 33, 34],\n \"data\": {\n \"deep\": {\n \"deeper\": {\n \"values\": [100, 200, 300, 400, 500]\n },\n \"another_level\": {\n \"info\": \"This should not be processed\"\n }\n }\n }\n }\n }\"\"\"\n data_key = \"readings.data.deep.deeper.values\"\n original_data, normalized_data, ax = f_830(json_str, data_key)\n expected_series = pd.Series([100, 200, 300, 400, 500], dtype=pd.Float64Dtype)\n pd.testing.assert_series_equal(original_data, expected_series)\n expected_normalized = pd.Series(\n [0.0, 0.25, 0.5, 0.75, 1.0], dtype=pd.Float64Dtype\n )\n pd.testing.assert_series_equal(normalized_data, expected_normalized, atol=1e-5)\n self.assertIsInstance(ax, plt.Axes)", "apis": ["json.loads", "pandas.Float64Dtype", "pandas.Series", "sklearn.preprocessing.MinMaxScaler", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "json", "pandas", "sklearn"], "doc": {"description": ["Processes a JSON string to extract numerical data, Min-Max normalize them,", "and generate a line plot.", "Notes:", "- The line plot includes labeled axes and a legend. 
It visualizes the original", "data with label \"Original Data\" and normalized ones as \"Normalized Data\".", "The function sets the plot title to \"Comparison of Original and Normalized Data\",", "with \"Index\" on the x-axis and \"Value\" on the y-axis."], "note": [], "params": ["json_data (str): JSON formatted string containing the data.", "data_key (str): Dot-separated full key path to access the numerical data within the JSON structure."], "returns": ["Tuple:", "pd.Series: Original dataset in float64.", "pd.Series or None: Dataset after Min-Max scaling in float64, or None if data is empty.", "plt.Axes or None: Line plot of normalized data, or None if data is empty."], "reqs": ["json", "pandas", "sklearn", "matplotlib"], "raises": ["KeyError: if key path is not found in the given data."], "example": [">>> json_str = '{\"data\": {\"values\": [5, 10, 15, 20, 25]}}'", ">>> original_data, normalized_data, ax = f_830(json_str, 'data.values')", ">>> type(original_data), type(normalized_data), type(ax)", "(, , )"]}} +{"task_id": "f_903", "prompt": "import numpy as np\nimport random\nimport itertools\nimport pandas as pd\n\n# Constants\nPLANETS = [\n \"Mercury\",\n \"Venus\",\n \"Earth\",\n \"Mars\",\n \"Jupiter\",\n \"Saturn\",\n \"Uranus\",\n \"Neptune\",\n]\nELEMENTS = [\n \"Hydrogen\",\n \"Helium\",\n \"Oxygen\",\n \"Carbon\",\n \"Nitrogen\",\n \"Magnesium\",\n \"Silicon\",\n \"Iron\",\n \"Nickel\",\n]\n\n\ndef f_903():\n \"\"\"\n Generate a DataFrame where each row contains random planet-element pairs.\n Each pair is formatted as 'Planet:Element'. 
The number of rows is determined by\n the number of planets, and each row will contain as many planet-element pairs as there are elements.\n\n Parameters:\n - None\n\n Returns:\n pandas.DataFrame: A DataFrame where each cell contains a string in the format 'Planet:Element'.\n The DataFrame has a number of rows equal to the number of planets and\n a number of columns equal to the number of elements.\n\n Requirements:\n - numpy\n - random\n - itertools\n - pandas\n\n Example:\n >>> random.seed(0)\n >>> planet_elements_table = f_903()\n >>> planet_elements_table.head(2)\n Hydrogen Helium Oxygen Carbon Nitrogen Magnesium Silicon Iron Nickel\n 0 Uranus:Silicon Earth:Silicon Neptune:Silicon Neptune:Nickel Uranus:Hydrogen Jupiter:Iron Neptune:Nitrogen Earth:Nickel Uranus:Helium\n 1 Venus:Magnesium Saturn:Helium Mars:Nitrogen Mercury:Helium Jupiter:Nitrogen Venus:Oxygen Neptune:Magnesium Mercury:Iron Venus:Helium\n \"\"\"", "canonical_solution": " # Generate all possible pairs\n pairs = [\n f\"{planet}:{element}\"\n for planet, element in itertools.product(PLANETS, ELEMENTS)\n ]\n # Shuffle the pairs to ensure randomness\n random.shuffle(pairs)\n\n # Convert the list of pairs into a numpy array, then reshape it to fit the DataFrame dimensions\n data = np.array(pairs).reshape(len(PLANETS), len(ELEMENTS))\n # Create the DataFrame with ELEMENTS as column headers\n df = pd.DataFrame(data, columns=ELEMENTS)\n\n return df", "test": "import unittest\nimport itertools\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for `f_903`.\"\"\"\n def test_basic_structure(self):\n \"\"\"Test the basic structure of the table.\"\"\"\n random.seed(0)\n table = f_903()\n # Verify the structure of the table\n self.assertEqual(len(table), len(PLANETS))\n self.assertEqual(list(table.columns), ELEMENTS)\n def test_pair_existence(self):\n \"\"\"Test the existence of planet-element pairs.\"\"\"\n random.seed(1)\n table = f_903()\n # Verify all planet-element pairs 
are present\n all_pairs = set(f\"{p}:{e}\" for p, e in itertools.product(PLANETS, ELEMENTS))\n generated_pairs = set(table.values.flatten())\n self.assertEqual(all_pairs, generated_pairs)\n # Verify no extra pairs are present\n self.assertEqual(len(all_pairs), len(generated_pairs))\n def test_data_type(self):\n \"\"\"Test the data type of the table and its elements.\"\"\"\n random.seed(2)\n table = f_903()\n # Check the data type of the table and its elements\n self.assertIsInstance(table, pd.DataFrame)\n self.assertTrue(all(isinstance(cell, str) for cell in table.values.flatten()))\n def test_data_format(self):\n \"\"\"Test the format of the elements in the table.\"\"\"\n random.seed(3)\n table = f_903()\n # Check the format of the elements in the table\n self.assertTrue(\n all(\n \":\" in cell and len(cell.split(\":\")) == 2\n for cell in table.values.flatten()\n )\n )\n def test_uniqueness(self):\n \"\"\"Test the uniqueness of the pairs.\"\"\"\n random.seed(4)\n table = f_903()\n # Check uniqueness of the pairs\n generated_pairs = table.values.flatten()\n self.assertEqual(len(generated_pairs), len(set(generated_pairs)))", "apis": ["pandas.DataFrame", "itertools.product", "random.shuffle", "numpy.array"], "libs": ["random", "itertools", "numpy", "pandas"], "doc": {"description": ["Generate a DataFrame where each row contains random planet-element pairs.", "Each pair is formatted as 'Planet:Element'. 
The number of rows is determined by", "the number of planets, and each row will contain as many planet-element pairs as there are elements."], "note": [], "params": ["None"], "returns": ["pandas.DataFrame: A DataFrame where each cell contains a string in the format 'Planet:Element'.", "The DataFrame has a number of rows equal to the number of planets and", "a number of columns equal to the number of elements."], "reqs": ["numpy", "random", "itertools", "pandas"], "raises": [], "example": [">>> random.seed(0)", ">>> planet_elements_table = f_903()", ">>> planet_elements_table.head(2)", "Hydrogen Helium Oxygen Carbon Nitrogen Magnesium Silicon Iron Nickel", "0 Uranus:Silicon Earth:Silicon Neptune:Silicon Neptune:Nickel Uranus:Hydrogen Jupiter:Iron Neptune:Nitrogen Earth:Nickel Uranus:Helium", "1 Venus:Magnesium Saturn:Helium Mars:Nitrogen Mercury:Helium Jupiter:Nitrogen Venus:Oxygen Neptune:Magnesium Mercury:Iron Venus:Helium"]}} +{"task_id": "f_417", "prompt": "from collections import Counter\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_417(num_rolls, num_dice, plot_path=None, random_seed=0):\n \"\"\"Simulate rolling a certain number of a standard six-sided dice several times, then\n identify and display the distribution of the sums of the dice rolls in a bar plot.\n\n Parameters:\n - num_rolls (int): The number of times to roll the dice.\n - num_dice (int): The number of dice to roll each time.\n - plot_path (str, optional): Path to save the generated plot. If not provided, plot is not saved.\n - random_seed (int): Random seed for reproducibility. 
Defaults to 0.\n\n Returns:\n tuple: A tuple containing the following elements:\n - Counter: A Counter object with the count of each possible sum.\n - Axes: A matplotlib Axes object representing the bar plot of the Distribution of Dice Roll Sums,\n with Sum of Dice Roll on the x-axis and count on the y-axis.\n\n Requirements:\n - collections.Counter\n - random\n - matplotlib.pyplot\n\n Example:\n >>> result, ax = f_417(10000, 2, 'output.png')\n >>> type(result)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " POSSIBLE_VALUES = list(range(1, 7))\n\n random.seed(random_seed)\n\n sums = []\n for _ in range(num_rolls):\n roll = [random.choice(POSSIBLE_VALUES) for _ in range(num_dice)]\n sums.append(sum(roll))\n\n sums_counter = Counter(sums)\n\n labels, values = zip(*sums_counter.items())\n\n plt.bar(labels, values)\n plt.xlabel(\"Sum of Dice Roll\")\n plt.ylabel(\"Count\")\n plt.title(\"Distribution of Dice Roll Sums\")\n ax = plt.gca()\n if plot_path:\n plt.savefig(plot_path)\n\n return sums_counter, ax", "test": "import unittest\nimport os\nfrom collections import Counter\nimport tempfile\nimport shutil\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Create a temporary directory to store plots\n self.test_dir = tempfile.mkdtemp()\n def tearDown(self):\n # Close matplotlib plots and remove temporary directory\n plt.close(\"all\")\n shutil.rmtree(self.test_dir)\n def test_case_1(self):\n # Test basic functionality with 100 rolls and 2 dice\n result, ax = f_417(100, 2, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_2(self):\n # Test plot saving functionality\n plot_path = os.path.join(self.test_dir, \"test_plot.png\")\n result, ax = f_417(1000, 1, plot_path, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(os.path.exists(plot_path))\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_3(self):\n # Test with a larger 
number of dice\n result, ax = f_417(500, 5, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_case_4(self):\n # Test with the minimum possible inputs\n result, ax = f_417(1, 1, random_seed=42)\n self.assertIsInstance(result, Counter)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(len(result), 1) # Only one possible sum with 1 roll of 1 die\n def test_case_5(self):\n # Test the effect of different random seeds on the result consistency\n result1, _ = f_417(100, 2, random_seed=42)\n result2, _ = f_417(100, 2, random_seed=43)\n self.assertNotEqual(\n result1, result2, \"Results should differ with different seeds\"\n )\n def test_case_6(self):\n # Test plot detail correctness (labels, title)\n plot_path = os.path.join(self.test_dir, \"test_plot_detail.png\")\n _, ax = f_417(10, 2, plot_path, random_seed=42)\n self.assertTrue(\n \"sum of dice roll\" in ax.get_xlabel().lower(), \"X-axis label is incorrect\"\n )\n self.assertEqual(ax.get_ylabel(), \"Count\", \"Y-axis label is incorrect\")\n self.assertTrue(\n \"distribution of dice roll sums\" in ax.get_title().lower(),\n \"Plot title is incorrect\",\n )\n def test_case_7(self):\n # Test data correctness with a manually calculated example\n result, _ = f_417(2, 1, random_seed=42)\n expected = Counter({6: 1, 1: 1})\n self.assertEqual(\n result, expected, \"Data distribution does not match expected outcome\"\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.bar", "collections.Counter", "matplotlib.pyplot.savefig", "random.seed", "random.choice", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "matplotlib.pyplot.xlabel"], "libs": ["collections", "random", "matplotlib"], "doc": {"description": ["Simulate rolling a certain number of a standard six-sided dice several times, then", "identify and display the distribution of the sums of the dice rolls in a bar plot."], "note": [], "params": 
["num_rolls (int): The number of times to roll the dice.", "num_dice (int): The number of dice to roll each time.", "plot_path (str, optional): Path to save the generated plot. If not provided, plot is not saved.", "random_seed (int): Random seed for reproducibility. Defaults to 0."], "returns": ["tuple: A tuple containing the following elements:", "Counter: A Counter object with the count of each possible sum.", "Axes: A matplotlib Axes object representing the bar plot of the Distribution of Dice Roll Sums,", "with Sum of Dice Roll on the x-axis and count on the y-axis."], "reqs": ["collections.Counter", "random", "matplotlib.pyplot"], "raises": [], "example": [">>> result, ax = f_417(10000, 2, 'output.png')", ">>> type(result)", "", ">>> type(ax)", ""]}} +{"task_id": "f_834", "prompt": "import binascii\nimport string\nimport random\n\ndef f_834(length):\n \"\"\"\n Generate a random hexadecimal string of a given length and then attempt to decode it in ASCII.\n The resulting ASCII string may contain non-printable characters\n or be shorter than the input length.\n\n Parameters:\n length (int): The length of the hexadecimal string.\n\n Returns:\n str: The decoded ASCII string.\n\n Requirements:\n - binascii\n - string\n - random\n\n Example:\n >>> random.seed(0)\n >>> f_834(6)\n '\\\\x18'\n >>> f_834(8)\n '\u01a4'\n \"\"\"", "canonical_solution": " HEX_CHARS = string.hexdigits.lower()\n hex_string = \"\".join(random.choice(HEX_CHARS) for _ in range(length))\n return binascii.unhexlify(hex_string).decode(\"utf-8\", \"ignore\")", "test": "import unittest\nimport string\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_834\"\"\"\n def test_correct_length(self):\n \"\"\"Test the length of the hexadecimal string before decoding.\"\"\"\n random.seed(2)\n length = 8\n HEX_CHARS = string.hexdigits.lower()\n hex_string = \"\".join(random.choice(HEX_CHARS) for _ in range(length))\n result = f_834(length)\n # Check if the length of the hexadecimal 
string before decoding is correct\n self.assertEqual(len(hex_string), length)\n self.assertEqual(result, \"]\")\n def test_correct_type(self):\n \"\"\"Test the type of the output.\"\"\"\n random.seed(4)\n result = f_834(6)\n self.assertIsInstance(result, str)\n self.assertEqual(result, \"y<\")\n def test_non_empty_string_positive_length(self):\n \"\"\"Test the output for a positive length.\"\"\"\n random.seed(6)\n result = f_834(6)\n self.assertNotEqual(result, \"\")\n self.assertEqual(result, \"\\x10\")\n def test_zero_length(self):\n \"\"\"Test the output for a zero length.\"\"\"\n random.seed(8)\n result = f_834(0)\n self.assertEqual(result, \"\")\n def test_negative_length_handling(self):\n \"\"\"Test the output for a negative length.\"\"\"\n random.seed(10)\n result = f_834(-1)\n self.assertEqual(result, \"\")", "apis": ["binascii.unhexlify", "string.hexdigits.lower", "random.choice", "string.hexdigits"], "libs": ["binascii", "random", "string"], "doc": {"description": ["Generate a random hexadecimal string of a given length and then attempt to decode it in ASCII.", "The resulting ASCII string may contain non-printable characters", "or be shorter than the input length."], "note": [], "params": ["length (int): The length of the hexadecimal string."], "returns": ["str: The decoded ASCII string."], "reqs": ["binascii", "string", "random"], "raises": [], "example": [">>> random.seed(0)", ">>> f_834(6)", "'\\\\x18'", ">>> f_834(8)", "'\u01a4'"]}} +{"task_id": "f_791", "prompt": "import numpy as np\nfrom sklearn.preprocessing import MinMaxScaler\n\ndef f_791(rows=3, columns=2, seed=42):\n \"\"\"\n Generate a matrix of random values with specified dimensions and scale it between 0 and 1.\n \n Parameters:\n rows (int): The number of rows for the matrix. Default is 3.\n columns (int): The number of columns for the matrix. 
Default is 2.\n \n Returns:\n ndarray: A numpy ndarray with scaled values between 0 and 1.\n \n Requirements:\n - numpy\n - sklearn.preprocessing.MinMaxScaler\n \n Example:\n >>> f_791(3, 2)\n array([[0.37939383, 1. ],\n [1. , 0.55700635],\n [0. , 0. ]])\n \n >>> f_791(2, 2)\n array([[0., 1.],\n [1., 0.]])\n \"\"\"", "canonical_solution": " np.random.seed(seed) # Ensure reproducibility for consistent outputs across different runs\n matrix = np.random.rand(rows, columns)\n scaler = MinMaxScaler()\n scaled_matrix = scaler.fit_transform(matrix)\n\n return scaled_matrix", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n result = f_791()\n self.assertEqual(result.shape, (3, 2))\n self.assertTrue(np.all(result >= 0))\n \n def test_case_2(self):\n result = f_791(2, 2)\n self.assertEqual(result.shape, (2, 2))\n self.assertTrue(np.all(result >= 0) and np.all(result <= 1))\n \n def test_case_3(self):\n result = f_791(4, 3)\n self.assertEqual(result.shape, (4, 3))\n self.assertTrue(np.all(result >= 0) and np.all(result <= 1))\n \n def test_case_4(self):\n result = f_791(5, 1)\n self.assertEqual(result.shape, (5, 1))\n self.assertTrue(np.all(result >= 0))\n \n def test_case_5(self):\n result = f_791(1, 5)\n self.assertEqual(result.shape, (1, 5))\n self.assertTrue(np.all(result >= 0) and np.all(result <= 1))", "apis": ["numpy.random", "numpy.random.seed", "numpy.random.rand", "sklearn.preprocessing.MinMaxScaler"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Generate a matrix of random values with specified dimensions and scale it between 0 and 1.", ">>> f_791(2, 2)", "array([[0., 1.],", "[1., 0.]])"], "note": [], "params": ["rows (int): The number of rows for the matrix. Default is 3.", "columns (int): The number of columns for the matrix. 
Default is 2."], "returns": ["ndarray: A numpy ndarray with scaled values between 0 and 1."], "reqs": ["numpy", "sklearn.preprocessing.MinMaxScaler"], "raises": [], "example": [">>> f_791(3, 2)", "array([[0.37939383, 1. ],", "[1. , 0.55700635],", "[0. , 0. ]])"]}} +{"task_id": "f_394", "prompt": "from datetime import datetime, timedelta\nimport pytz\nimport calendar\n\n\ndef f_394(days_in_past=7):\n \"\"\"\n Get the weekday of the date 'days_in_past' days ago from today.\n\n This function computes the date that is 'days_in_past' number of days ago from the current\n system time's date in UTC. It then determines the weekday of this target date using calendar\n and returns its name as a string.\n\n Parameters:\n days_in_past (int): The number of days to go back from the current date to find the weekday.\n Defaults to 7 (one week ago). Must be a non-negative integer.\n\n Returns:\n weekday (str) : The name of the weekday (e.g., 'Monday', 'Tuesday') for the computed date.\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - pytz\n - calendar\n\n Example:\n >>> f_394()\n 'Monday'\n >>> f_394(3)\n 'Friday'\n \"\"\"", "canonical_solution": " if days_in_past < 0:\n raise ValueError(\"Days in the past cannot be negative\")\n\n date = datetime.now(pytz.UTC) - timedelta(days=days_in_past)\n weekday = calendar.day_name[date.weekday()]\n\n return weekday", "test": "import unittest\nfrom datetime import datetime, timedelta\nimport pytz\nimport calendar\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Input 1: Default input\n result = f_394()\n self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for 7 days ago\n expected_date = datetime.now(pytz.UTC) - timedelta(days=7)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_2(self):\n # Input 2: Test with 3 days in the past\n result = f_394(3)\n 
self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for 3 days ago\n expected_date = datetime.now(pytz.UTC) - timedelta(days=3)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_3(self):\n # Input 3: Test with 0 days in the past (today)\n result = f_394(0)\n self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for today\n expected_date = datetime.now(pytz.UTC)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_4(self):\n # Input 4: Test with 30 days in the past (approximately a month ago)\n result = f_394(30)\n self.assertIsInstance(result, str)\n self.assertIn(result, list(calendar.day_name))\n # Ensure the result matches the expected output for 30 days ago\n expected_date = datetime.now(pytz.UTC) - timedelta(days=30)\n expected_weekday = calendar.day_name[expected_date.weekday()]\n self.assertEqual(result, expected_weekday)\n def test_case_5(self):\n # Input 5: Test handling invalid days_in_the_past\n for invalid in [-1, \"1\"]:\n with self.assertRaises(Exception):\n f_394(invalid)", "apis": ["datetime.timedelta", "datetime.datetime.now", "calendar.day_name", "pytz.UTC"], "libs": ["calendar", "pytz", "datetime"], "doc": {"description": ["Get the weekday of the date 'days_in_past' days ago from today.", "This function computes the date that is 'days_in_past' number of days ago from the current", "system time's date in UTC. It then determines the weekday of this target date using calendar", "and returns its name as a string."], "note": [], "params": ["days_in_past (int): The number of days to go back from the current date to find the weekday.", "Defaults to 7 (one week ago). 
Must be a non-negative integer."], "returns": ["weekday (str) : The name of the weekday (e.g., 'Monday', 'Tuesday') for the computed date."], "reqs": ["datetime.datetime", "datetime.timedelta", "pytz", "calendar"], "raises": [], "example": [">>> f_394()", "'Monday'", ">>> f_394(3)", "'Friday'"]}} +{"task_id": "f_928", "prompt": "import pandas as pd\nfrom sklearn.feature_selection import f_oneway\n\ndef f_928(data_file_path: str):\n \"\"\"\n Analyzes numerical data from a CSV file. The function reads the CSV file, converts string representations of\n numbers with commas into floating point numbers, calculates the mean and standard deviation for each numerical column,\n generates a histogram plot for each numerical column, and performs an ANOVA test to check the statistical significance \n of differences between means of numerical columns (if applicable).\n\n Parameters:\n - data_file_path (str): Path to the CSV data file.\n\n Returns:\n - means (pd.Series): Mean values of each numerical column.\n - std_devs (pd.Series): Standard deviation values of each numerical column.\n - axes (list[matplotlib.axes.Axes]): List of histogram plots for each numerical column.\n - anova_results (pd.DataFrame): ANOVA test results for each pair of numerical columns (if more than one numerical column is present).\n\n Requirements:\n - pandas\n - sklearn\n\n Note:\n - The function assumes that all columns in the CSV file contain numerical data or string representations of numerical data.\n - The ANOVA test is only performed if there are two or more numerical columns. 
Compute two columns \"F-value\" and \"P-value\" for each pair of numerical columns.\n\n Example:\n >>> means, std_devs, axes, anova_results = f_928('data.csv')\n >>> print(f'Means: {means}, Standard Deviations: {std_devs}')\n >>> print(anova_results)\n \"\"\"", "canonical_solution": " df = pd.read_csv(data_file_path)\n # Convert strings with commas to float, if applicable\n for col in df.columns:\n df[col] = pd.to_numeric(df[col].replace(\",\", \"\", regex=True), errors=\"coerce\")\n # drop columns with NaN values\n df = df.dropna(axis=1)\n means = df.mean()\n std_devs = df.std()\n\n # Creating a histogram for each numerical column\n axes = []\n for col in df.columns:\n ax = df[col].hist(bins=50)\n ax.set_title(col)\n axes.append(ax)\n\n plt.show()\n\n # ANOVA Test if more than one numerical column\n anova_results = None\n if len(df.columns) > 1:\n anova_results = pd.DataFrame(f_oneway(*[df[col] for col in df.columns if df[col].dtype != 'object']),\n index=['F-value', 'P-value'], \n columns=['ANOVA Results'])\n\n return means, std_devs, axes, anova_results", "test": "import unittest\nfrom unittest.mock import patch\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_928\"\"\"\n @patch(\"pandas.read_csv\")\n def test_empty_file(self, mock_read_csv):\n \"\"\"\n Test the function with an empty CSV file.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame()\n means, std_devs, axes, anova_results = f_928(\"empty.csv\")\n self.assertTrue(means.empty)\n self.assertTrue(std_devs.empty)\n self.assertEqual(len(axes), 0)\n self.assertIsNone(anova_results)\n @patch(\"pandas.read_csv\")\n def test_single_column(self, mock_read_csv):\n \"\"\"\n Test the function with a CSV file having a single numerical column.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [1, 2, 3, 4, 5]})\n means, std_devs, axes, anova_results = f_928(\"single_column.csv\")\n self.assertEqual(means[\"A\"], 3)\n 
self.assertAlmostEqual(std_devs[\"A\"], 1.5811, places=4)\n self.assertEqual(len(axes), 1)\n self.assertIsNone(anova_results)\n @patch(\"pandas.read_csv\")\n def test_multiple_columns(self, mock_read_csv):\n \"\"\"\n Test the function with a CSV file having multiple numerical columns.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n means, _, axes, anova_results = f_928(\"multiple_columns.csv\")\n self.assertEqual(means[\"A\"], 2)\n self.assertEqual(means[\"B\"], 5)\n self.assertEqual(len(axes), 2)\n self.assertEqual(anova_results[\"ANOVA Results\"][\"F-value\"], 13.5)\n self.assertAlmostEqual(anova_results[\"ANOVA Results\"][\"P-value\"], 0.021312, places=5)\n \n @patch(\"pandas.read_csv\")\n def test_numerical_and_non_numerical_columns(self, mock_read_csv):\n \"\"\"\n Test the function with a mix of numerical and non-numerical columns.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [\"a\", \"b\", \"c\"]})\n means, std_devs, axes, anova_results = f_928(\"mixed_columns.csv\")\n self.assertEqual(len(means), 1) # Only one numerical column\n self.assertEqual(len(std_devs), 1)\n self.assertEqual(len(axes), 1)\n self.assertIsNone(anova_results)\n @patch(\"pandas.read_csv\")\n def test_with_special_characters(self, mock_read_csv):\n \"\"\"\n Test the function with a CSV file containing numbers with special characters (e.g., commas).\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"A\": [\"1,000\", \"2,000\", \"3,000\"]})\n means, std_devs, axes, anova_results = f_928(\"special_characters.csv\")\n self.assertAlmostEqual(means[\"A\"], 2000, places=0)\n self.assertAlmostEqual(std_devs[\"A\"], pd.Series([1000, 2000, 3000]).std(), places=0)\n self.assertEqual(len(axes), 1)\n self.assertIsNone(anova_results)\n def tearDown(self):\n plt.close()", "apis": ["pandas.read_csv", "pandas.DataFrame", "sklearn.feature_selection.f_oneway", "pandas.to_numeric"], "libs": ["pandas", "sklearn"], "doc": 
{"description": ["Analyzes numerical data from a CSV file. The function reads the CSV file, converts string representations of", "numbers with commas into floating point numbers, calculates the mean and standard deviation for each numerical column,", "generates a histogram plot for each numerical column, and performs an ANOVA test to check the statistical significance", "of differences between means of numerical columns (if applicable)."], "note": ["The function assumes that all columns in the CSV file contain numerical data or string representations of numerical data.", "The ANOVA test is only performed if there are two or more numerical columns. Compute two columns \"F-value\" and \"P-value\" for each pair of numerical columns."], "params": ["data_file_path (str): Path to the CSV data file."], "returns": ["means (pd.Series): Mean values of each numerical column.", "std_devs (pd.Series): Standard deviation values of each numerical column.", "axes (list[matplotlib.axes.Axes]): List of histogram plots for each numerical column.", "anova_results (pd.DataFrame): ANOVA test results for each pair of numerical columns (if more than one numerical column is present)."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> means, std_devs, axes, anova_results = f_928('data.csv')", ">>> print(f'Means: {means}, Standard Deviations: {std_devs}')", ">>> print(anova_results)"]}} +{"task_id": "f_794", "prompt": "import pandas as pd\nimport numpy as np\nimport random\nfrom random import randint, seed\n\n# Constants\nCATEGORIES = ['Electronics', 'Clothing', 'Home & Kitchen', 'Books', 'Toys & Games']\n\ndef f_794(mystrings, n_products, seed=0):\n \"\"\"\n Create a product catalog DataFrame where each row represents a product with the following columns:\n - 'Product Name': The name of the product with spaces replaced by underscores.\n - 'Category': The category to which the product belongs.\n - 'Price': The price of the product, generated randomly based on a normal 
distribution with a mean of 50 and a standard deviation of 10.\n \n Parameters:\n mystrings (list of str): List of product names.\n n_products (int): Number of products to generate in the catalog.\n\n Returns:\n pd.DataFrame: A pandas DataFrame containing the product catalog information.\n\n Requirements:\n - pandas\n - numpy\n - random.randint\n - random.seed\n\n Constants:\n - CATEGORIES: A list of categories used to randomly assign a category to each product.\n\n Examples:\n >>> f_794(['Mobile Phone', 'T Shirt', 'Coffee Maker', 'Python Book', 'Toy Car'], 2)\n Product Name Category Price\n 0 Python_Book Books 67.64\n 1 Mobile_Phone Home & Kitchen 54.00\n >>> f_794(['Laptop', 'Sweater'], 1)\n Product Name Category Price\n 0 Sweater Books 67.64\n \"\"\"", "canonical_solution": " catalogue_data = []\n random.seed(seed)\n np.random.seed(seed)\n for _ in range(n_products):\n product_name = mystrings[randint(0, len(mystrings) - 1)].replace(' ', '_')\n category = CATEGORIES[randint(0, len(CATEGORIES) - 1)]\n price = round(np.random.normal(50, 10), 2)\n catalogue_data.append([product_name, category, price])\n\n catalogue_df = pd.DataFrame(catalogue_data, columns=['Product Name', 'Category', 'Price'])\n\n return catalogue_df", "test": "import unittest\nfrom pandas.testing import assert_frame_equal\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n \n result = f_794(['Mobile Phone', 'T Shirt', 'Coffee Maker', 'Python Book', 'Toy Car'], 2, 42)\n # assert the value of the DataFrame\n self.assertEqual(result['Product Name'].tolist(), ['Mobile_Phone', 'Coffee_Maker'])\n self.assertEqual(result['Category'].tolist(), ['Electronics', 'Clothing'])\n self.assertEqual(result['Price'].tolist(), [54.97, 48.62])\n \n def test_case_2(self):\n result = f_794(['Laptop', 'Sweater'], 1)\n self.assertEqual(result['Product Name'].tolist(), ['Sweater'])\n self.assertEqual(result['Category'].tolist(), ['Books'])\n self.assertEqual(result['Price'].tolist(), [67.64])\n \n def 
test_case_3(self):\n result = f_794(['Book', 'Pen', 'Bag'], 3)\n self.assertEqual(result['Product Name'].tolist(), ['Pen', 'Book', 'Bag'])\n self.assertEqual(result['Category'].tolist(), ['Books', 'Home & Kitchen', 'Books'])\n self.assertEqual(result['Price'].tolist(), [67.64, 54.00, 59.79])\n \n def test_case_4(self):\n result = f_794(['Watch'], 2)\n self.assertEqual(result['Product Name'].tolist(), ['Watch', 'Watch'])\n self.assertEqual(result['Category'].tolist(), ['Books', 'Home & Kitchen'])\n self.assertEqual(result['Price'].tolist(), [67.64, 54.00])\n def test_case_5(self):\n result = f_794(['TV', 'Fridge', 'Sofa', 'Table'], 0)\n self.assertEqual(result.empty, True)", "apis": ["pandas.DataFrame", "numpy.random", "random.randint", "random.seed", "numpy.random.normal", "numpy.random.seed"], "libs": ["random", "numpy", "pandas"], "doc": {"description": ["Create a product catalog DataFrame where each row represents a product with the following columns:", "- 'Product Name': The name of the product with spaces replaced by underscores.", "- 'Category': The category to which the product belongs.", "- 'Price': The price of the product, generated randomly based on a normal distribution with a mean of 50 and a standard deviation of 10.", "Constants:", "- CATEGORIES: A list of categories used to randomly assign a category to each product."], "note": [], "params": ["mystrings (list of str): List of product names.", "n_products (int): Number of products to generate in the catalog."], "returns": ["pd.DataFrame: A pandas DataFrame containing the product catalog information."], "reqs": ["pandas", "numpy", "random.randint", "random.seed"], "raises": [], "example": ["Examples:", ">>> f_794(['Mobile Phone', 'T Shirt', 'Coffee Maker', 'Python Book', 'Toy Car'], 2)", "Product Name Category Price", "0 Python_Book Books 67.64", "1 Mobile_Phone Home & Kitchen 54.00", ">>> f_794(['Laptop', 'Sweater'], 1)", "Product Name Category Price", "0 Sweater Books 67.64"]}} +{"task_id": "f_335", 
"prompt": "import pandas as pd\nimport seaborn as sns\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_335(df1, df2):\n \"\"\"\n Merge two dataframes on the 'id' column and then scale the numeric features.\n\n This function merges two dataframes via outer join on the 'id' column, and scales the merged dataframe's\n numeric features from df1 to have a mean of 0 and standard deviation of 1. It also returns a pair plot of\n the scaled features from df1.\n\n Parameters:\n - df1 (pd.DataFrame): Left dataframe to merge into.\n - df2 (pd.DataFrame): Right dataframe to merge from.\n\n Returns:\n - merged_df (pd.DataFrame): The partially scaled and merged dataframe.\n - pair_plot (seaborn.axisgrid.PairGrid): Pair plot of the scaled dataframe.\n\n Requirements:\n - pandas\n - sklearn\n - seaborn\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature4': [4.5, 6.7, 8.9], 'feature5': [5.6, 7.8, 9.0]})\n >>> scaled_df, plot = f_335(df1, df2)\n >>> scaled_df\n id feature1 feature2 feature4 feature5\n 0 1 -1.224745 -1.224745 4.5 5.6\n 1 2 0.000000 0.000000 6.7 7.8\n 2 3 1.224745 1.224745 8.9 9.0\n >>> type(scaled_df)\n \n >>> type(plot)\n \n \"\"\"", "canonical_solution": " merged_df = pd.merge(df1, df2, on=\"id\", how=\"outer\")\n\n # Select only numeric columns from df1 (excluding 'id')\n numeric_features_df1 = df1.select_dtypes(\n include=[\"float64\", \"int64\"]\n ).columns.tolist()\n if \"id\" in numeric_features_df1:\n numeric_features_df1.remove(\"id\")\n\n # Scale only the numeric features of df1\n if not merged_df.empty and numeric_features_df1:\n scaler = StandardScaler()\n merged_df[numeric_features_df1] = scaler.fit_transform(\n merged_df[numeric_features_df1]\n )\n\n # Pair plot only for the numeric features of df1\n pair_plot = None\n if numeric_features_df1:\n pair_plot = sns.pairplot(merged_df[numeric_features_df1])\n\n return merged_df, 
pair_plot", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Standard data merging on 'id' and checking scaled values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1.2, 3.4, 5.6],\n \"feature2\": [2.3, 4.5, 6.7],\n \"feature3\": [3.4, 5.6, 7.8],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [4.5, 6.7, 8.9], \"feature5\": [5.6, 7.8, 9.0]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertEqual(\n list(scaled_df.columns),\n [\"id\", \"feature1\", \"feature2\", \"feature3\", \"feature4\", \"feature5\"],\n )\n self.assertAlmostEqual(scaled_df[\"feature1\"].mean(), 0, places=5)\n def test_case_2(self):\n # Random data merging and checking scaled values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 3, 5],\n \"feature1\": [10, 20, 30],\n \"feature2\": [5, 15, 25],\n \"feature3\": [6, 16, 26],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 5, 3], \"feature4\": [7, 17, 27], \"feature5\": [8, 18, 28]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertAlmostEqual(scaled_df[\"feature2\"].std(), 1.224745, places=5)\n def test_case_3(self):\n # Negative values and merging on 'id' and checking scaled values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [-1, -2, -3],\n \"feature2\": [-5, -6, -7],\n \"feature3\": [-8, -9, -10],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [-11, -12, -13], \"feature5\": [-14, -15, -16]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertAlmostEqual(scaled_df[\"feature3\"].max(), 1.224745, places=5)\n def test_case_4(self):\n # Zero values and checking if scaled values remain zero\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4],\n \"feature1\": [0, 0, 0, 0],\n \"feature2\": [0, 0, 0, 0],\n \"feature3\": [0, 0, 0, 0],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4], \"feature4\": [0, 0, 0, 0], \"feature5\": [0, 0, 0, 0]}\n )\n scaled_df, _ = f_335(df1, df2)\n 
self.assertAlmostEqual(scaled_df[\"feature1\"].min(), 0, places=5)\n def test_case_5(self):\n # Large values and checking scaled min values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2],\n \"feature1\": [1000, 2000],\n \"feature2\": [500, 1500],\n \"feature3\": [100, 200],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2], \"feature4\": [10, 20], \"feature5\": [1, 2]})\n scaled_df, _ = f_335(df1, df2)\n self.assertAlmostEqual(scaled_df[\"feature2\"].min(), -1, places=5)\n def test_case_6(self):\n # Testing the plot's attributes\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [4, 5, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [10, 11, 12], \"feature5\": [13, 14, 15]}\n )\n _, pair_plot = f_335(df1, df2)\n # Checking if the pair plot has the expected attributes\n self.assertEqual(\n len(pair_plot.axes), 3\n ) # Because we have 3 valid features in df1\n self.assertIn(\"feature1\", pair_plot.data.columns)\n self.assertIn(\"feature2\", pair_plot.data.columns)\n self.assertIn(\"feature3\", pair_plot.data.columns)\n def test_case_7(self):\n # Testing with empty dataframes\n df1 = pd.DataFrame(columns=[\"id\", \"feature1\", \"feature2\", \"feature3\"])\n df2 = pd.DataFrame(columns=[\"id\", \"feature4\", \"feature5\"])\n scaled_df, _ = f_335(df1, df2)\n self.assertTrue(scaled_df.empty)\n def test_case_8(self):\n # Testing with NaN values in the dataframes\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, None],\n \"feature2\": [4, None, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3], \"feature4\": [10, 11, 12], \"feature5\": [13, 14, 15]}\n )\n scaled_df, _ = f_335(df1, df2)\n self.assertTrue(scaled_df.isnull().any().any()) # Checking if NaN values exist\n def tearDown(self):\n plt.close(\"all\")", "apis": ["sklearn.preprocessing.StandardScaler", "pandas.merge", "seaborn.pairplot"], "libs": ["seaborn", "pandas", "sklearn"], "doc": 
{"description": ["Merge two dataframes on the 'id' column and then scale the numeric features.", "This function merges two dataframes via outer join on the 'id' column, and scales the merged dataframe's", "numeric features from df1 to have a mean of 0 and standard deviation of 1. It also returns a pair plot of", "the scaled features from df1."], "note": [], "params": ["df1 (pd.DataFrame): Left dataframe to merge into.", "df2 (pd.DataFrame): Right dataframe to merge from."], "returns": ["merged_df (pd.DataFrame): The partially scaled and merged dataframe.", "pair_plot (seaborn.axisgrid.PairGrid): Pair plot of the scaled dataframe."], "reqs": ["pandas", "sklearn", "seaborn"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature4': [4.5, 6.7, 8.9], 'feature5': [5.6, 7.8, 9.0]})", ">>> scaled_df, plot = f_335(df1, df2)", ">>> scaled_df", "id feature1 feature2 feature4 feature5", "0 1 -1.224745 -1.224745 4.5 5.6", "1 2 0.000000 0.000000 6.7 7.8", "2 3 1.224745 1.224745 8.9 9.0", ">>> type(scaled_df)", "", ">>> type(plot)", ""]}} +{"task_id": "f_536", "prompt": "import os\nimport random\n\ndef f_536(directory, n_files):\n \"\"\"\n Create n random text files in a specific directory, write a random string to each file, and then reset the cursor to the beginning of each file.\n\n Parameters:\n - directory (str): The directory in which to generate the files.\n - n_files (int): The number of files to generate.\n\n Returns:\n - directory (str): The directory in which the files were generated.\n\n Requirements:\n - os\n - random\n\n Example:\n >>> f_536('/path/to/directory', 5)\n '/path/to/directory'\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n os.makedirs(directory)\n\n for i in range(n_files):\n filename = os.path.join(directory, f\"file_{i+1}.txt\")\n\n with open(filename, 'w') as file:\n file.write(str(random.randint(1, 
100)))\n file.seek(0)\n\n return directory", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def setUp(self):\n random.seed(42)\n \n def tearDown(self):\n shutil.rmtree('./source', ignore_errors=True)\n shutil.rmtree('./src', ignore_errors=True)\n shutil.rmtree('./s', ignore_errors=True)\n \n def test_case_1(self):\n directory = f_536('./source', 10)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 10)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt')\n \n def test_case_2(self):\n directory = f_536('./src', 1)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 1)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt') \n \n def test_case_3(self):\n directory = f_536('./s', 100)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 100)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt') \n \n def test_case_4(self):\n directory = f_536('./s', 0)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 0)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt') \n \n def test_case_5(self):\n directory = f_536('./source', 1)\n self.assertTrue(os.path.exists(directory))\n self.assertEqual(len(os.listdir(directory)), 1)\n for file in os.listdir(directory):\n self.assertEqual(file.split('.')[-1], 'txt')", "apis": ["os.makedirs", "random.randint", "os.path", "os.path.join", "os.path.exists"], "libs": ["os", "random"], "doc": {"description": ["Create n random text files in a specific directory, write a random string to each file, and then reset the cursor to the beginning of each file."], "note": [], "params": ["directory (str): The directory in which to generate the files.", "n_files (int): The number of files to generate."], "returns": ["directory (str): The directory 
in which the files were generated."], "reqs": ["os", "random"], "raises": [], "example": [">>> f_536('/path/to/directory', 5)", "'/path/to/directory'"]}} +{"task_id": "f_790", "prompt": "import numpy as np\nimport random\nfrom datetime import datetime\n\ndef f_790(rows=3, columns=2, start_date=datetime(2021, 1, 1), end_date=datetime(2021, 12, 31), seed=0):\n \"\"\"\n Generate a matrix with unique dates between a given start and end date.\n \n Functionality:\n This function generates a matrix of given dimensions (rows x columns) containing unique dates between \n a specified start date and end date.\n \n Input:\n - rows (int): The number of rows for the output matrix. Default is 3.\n - columns (int): The number of columns for the output matrix. Default is 2.\n - start_date (datetime): The start date for the range of unique dates. Default is datetime(2021, 1, 1).\n - end_date (datetime): The end date for the range of unique dates. Default is datetime(2021, 12, 31).\n \n Output to be returned:\n - ndarray: A numpy ndarray with unique dates in the shape (rows, columns).\n \n Requirements:\n - numpy\n - itertools\n - datetime\n - random\n \n Example:\n >>> matrix = f_790(2, 2, datetime(2021, 1, 1), datetime(2021, 1, 10))\n >>> print(matrix)\n [['2021-01-03T00:00:00.000000000', '2021-01-07T00:00:00.000000000'],\n ['2021-01-09T00:00:00.000000000', '2021-01-04T00:00:00.000000000']]\n \"\"\"", "canonical_solution": " # Convert start_date and end_date to numpy datetime64 objects\n if seed is not None:\n random.seed(seed)\n \n # Convert start_date and end_date to numpy datetime64 objects\n start_date_np = np.datetime64(start_date)\n end_date_np = np.datetime64(end_date)\n\n # Calculate the number of days between start_date and end_date\n total_days = int((end_date_np - start_date_np).astype('timedelta64[D]').astype(int) + 1)\n\n # Randomly select unique dates within the range without replacement using random.sample\n selected_dates = sorted(random.sample(range(total_days), 
rows * columns))\n\n # Generate the matrix with selected unique dates\n matrix = (start_date_np + np.array(selected_dates).astype('timedelta64[D]')).reshape(rows, columns)\n\n return matrix", "test": "# Unit testing\nimport unittest\nimport numpy.testing as npt\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n # Using default parameters\n matrix = f_790(seed=0)\n self.assertEqual(matrix.shape, (3, 2))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) > 0)) # Dates should be unique\n def test_case_2(self):\n # Using custom rows and columns, and a small date range\n matrix = f_790(2, 2, datetime(2021, 1, 1), datetime(2021, 1, 10), seed=42)\n self.assertEqual(matrix.shape, (2, 2))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) >= 0)) # Dates should be unique\n def test_case_3(self):\n # Using custom rows and columns, and a large date range\n matrix = f_790(4, 4, datetime(2000, 1, 1), datetime(2021, 12, 31), seed=55)\n self.assertEqual(matrix.shape, (4, 4))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) >= 0)) # Dates should be unique\n def test_case_4(self):\n # Using a date range of one day\n matrix = f_790(1, 1, datetime(2021, 1, 1), datetime(2021, 1, 1), seed=0)\n expected_date = np.array(['2021-01-01'], dtype='datetime64[us]').reshape(1, 1)\n npt.assert_array_equal(matrix, expected_date) # Only one date in the range\n def test_case_5(self):\n # Using custom rows and columns, and a date range with only two days\n matrix = f_790(1, 2, datetime(2021, 1, 1), datetime(2021, 1, 2), seed=41)\n self.assertEqual(matrix.shape, (1, 2))\n self.assertTrue(np.all(np.diff(matrix.ravel()).astype(int) >= 0)) # Dates should be unique\n expected_dates = np.array(['2021-01-01', '2021-01-02'], dtype='datetime64[us]').reshape(1, 2)\n for date in expected_dates.ravel():\n self.assertIn(date, matrix.ravel())", "apis": ["numpy.array", "numpy.datetime64", "random.seed", "random.sample", "datetime.datetime"], "libs": ["random", 
"numpy", "datetime"], "doc": {"description": ["Generate a matrix with unique dates between a given start and end date.", "Functionality:", "This function generates a matrix of given dimensions (rows x columns) containing unique dates between", "a specified start date and end date.", "Input:", "- rows (int): The number of rows for the output matrix. Default is 3.", "- columns (int): The number of columns for the output matrix. Default is 2.", "- start_date (datetime): The start date for the range of unique dates. Default is datetime(2021, 1, 1).", "- end_date (datetime): The end date for the range of unique dates. Default is datetime(2021, 12, 31).", "Output to be returned:", "- ndarray: A numpy ndarray with unique dates in the shape (rows, columns)."], "note": [], "params": [], "returns": [], "reqs": ["numpy", "itertools", "datetime", "random"], "raises": [], "example": [">>> matrix = f_790(2, 2, datetime(2021, 1, 1), datetime(2021, 1, 10))", ">>> print(matrix)", "[['2021-01-03T00:00:00.000000000', '2021-01-07T00:00:00.000000000'],", "['2021-01-09T00:00:00.000000000', '2021-01-04T00:00:00.000000000']]"]}} +{"task_id": "f_608", "prompt": "import json\nimport csv\nimport os\nimport base64\n\ndef f_608(raw_string, filename, output_dir):\n \"\"\"\n Processes a base64-encoded JSON string, stores the data in a CSV file, and returns the path of the file.\n\n Parameters:\n - raw_string (str): The base64 encoded JSON string.\n - filename (str): The name of the file to which the data should be saved (without extension).\n - output_dir (str): The path of the directory in which the file should be saved.\n\n Returns:\n - file_path (str): The path of the file.\n\n Requirements:\n - json\n - csv\n - os\n - base64\n\n Example:\n >>> f_608('eyJrZXkiOiAiVmFsdWUifQ==', 'data', './output')\n './output/data.csv'\n \"\"\"", "canonical_solution": " # Decode the string and load the data\n decoded_string = base64.b64decode(raw_string).decode('utf-8')\n data = json.loads(decoded_string)\n\n 
# Prepare the output directory\n os.makedirs(output_dir, exist_ok=True)\n\n # Prepare the file path\n file_path = os.path.join(output_dir, f'{filename}.csv')\n\n # Save the data to the file\n with open(file_path, 'w', newline='') as f:\n writer = csv.writer(f)\n for key, value in data.items():\n writer.writerow([key, value])\n\n return file_path", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n if os.path.exists('./output'):\n shutil.rmtree('./output')\n \n def test_case_1(self):\n raw_string = 'eyJrZXkiOiAiVmFsdWUifQ=='\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,Value\\n')\n os.remove(expected)\n \n def test_case_2(self):\n string_before = \"\"\"{\"key\": \"hello\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\n')\n os.remove(expected)\n def test_case_3(self):\n string_before = \"\"\"{\"key\": \"hello\", \"key2\": \"world\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\nkey2,world\\n')\n os.remove(expected)\n def test_case_4(self):\n string_before = \"\"\"{\"key\": \"hello\", \"key2\": \"world\", \"key3\": \"!\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, 
output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\nkey2,world\\nkey3,!\\n')\n os.remove(expected)\n def test_case_5(self):\n string_before = \"\"\"{\"key\": \"hello\", \"key2\": \"world\", \"key3\": \"!\", \"key4\": \"test\"}\"\"\"\n raw_string = base64.b64encode(string_before.encode('utf-8')).decode('utf-8')\n filename = 'data'\n output_dir = './output'\n expected = './output/data.csv'\n self.assertEqual(f_608(raw_string, filename, output_dir), expected)\n with open(expected, 'r') as f:\n self.assertEqual(f.read(), 'key,hello\\nkey2,world\\nkey3,!\\nkey4,test\\n')\n os.remove(expected)", "apis": ["json.loads", "base64.b64decode", "os.makedirs", "os.path", "os.path.join", "csv.writer"], "libs": ["base64", "os", "json", "csv"], "doc": {"description": ["Processes a base64-encoded JSON string, stores the data in a CSV file, and returns the path of the file."], "note": [], "params": ["raw_string (str): The base64 encoded JSON string.", "filename (str): The name of the file to which the data should be saved (without extension).", "output_dir (str): The path of the directory in which the file should be saved."], "returns": ["file_path (str): The path of the file."], "reqs": ["json", "csv", "os", "base64"], "raises": [], "example": [">>> f_608('eyJrZXkiOiAiVmFsdWUifQ==', 'data', './output')", "'./output/data.csv'"]}} +{"task_id": "f_804", "prompt": "import os\nimport glob\nfrom collections import Counter\n\n\ndef f_804(directory, extensions=[\".txt\", \".docx\", \".xlsx\", \".csv\"], keep_zero=True):\n \"\"\"\n Traverses a given directory recursively to count files by specified extensions.\n\n Parameters:\n - directory (str): The path of the directory to search.\n - extensions (list of str): File extensions to count. Defaults to ['.txt', '.docx', '.xlsx', '.csv'].\n - keep_zero (bool): Whether to include extensions with zero counts. 
Defaults to True.\n\n Returns:\n - Counter: An object containing counts of files for each of the specified extensions.\n\n Raises:\n - OSError: If the specified directory does not exist.\n\n Requirements:\n - os\n - glob\n - collections\n\n Note:\n - This function counts files in a case-sensitive manner.\n\n Examples:\n >>> f_804('/path/to/documents')\n Counter({'.txt': 5, '.docx': 2, '.xlsx': 1, '.csv': 0})\n >>> f_804('/path/to/documents', keep_zero=False)\n Counter({'.txt': 5, '.docx': 2, '.xlsx': 1})\n >>> f_804('/path/to/documents', extensions=['.txt'], keep_zero=False)\n Counter({'.txt': 5})\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n raise OSError(\"directory must exist.\")\n\n counter = Counter()\n\n for suffix in extensions:\n count = len(\n glob.glob(os.path.join(directory, \"**\", \"*\" + suffix), recursive=True)\n )\n if count:\n counter[suffix] += count\n else:\n if keep_zero:\n counter[suffix] += count\n return counter", "test": "import unittest\nfrom collections import Counter\nfrom tempfile import TemporaryDirectory\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = TemporaryDirectory()\n def tearDown(self):\n self.temp_dir.cleanup()\n def create_test_files(self, directory, file_list):\n for file_name in file_list:\n with open(os.path.join(directory, file_name), \"w\") as f:\n f.write(\"Test\")\n def test_case_1(self):\n # Test basic case with default extensions\n file_names = [\"file1.txt\", \"file2.docx\", \"file3.xlsx\", \"file4.csv\", \"file5.txt\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 2, \".docx\": 1, \".xlsx\": 1, \".csv\": 1})\n self.assertEqual(result, expected)\n def test_case_2(self):\n # Test empty directory\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 0, \".docx\": 0, \".xlsx\": 0, \".csv\": 0})\n self.assertEqual(result, expected)\n def test_case_3(self):\n # Test 
error handling - non-existent directory\n with self.assertRaises(OSError):\n f_804(\"/path/to/nonexistent/directory\")\n def test_case_4(self):\n # Test ignoring unspecified extensions\n file_names = [\"file1.pdf\", \"file2.png\", \"file3.txt\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 1, \".docx\": 0, \".xlsx\": 0, \".csv\": 0})\n self.assertEqual(result, expected)\n def test_case_5(self):\n # Test nested folders\n nested_dir_path = os.path.join(self.temp_dir.name, \"nested\")\n os.makedirs(nested_dir_path)\n file_names = [\"nested_file1.txt\", \"nested_file2.xlsx\"]\n self.create_test_files(nested_dir_path, file_names)\n result = f_804(self.temp_dir.name)\n expected = Counter({\".txt\": 1, \".xlsx\": 1, \".docx\": 0, \".csv\": 0})\n self.assertEqual(result, expected)\n def test_case_6(self):\n # Test custom extensions\n file_names = [\"image.jpeg\", \"video.mp4\", \"document.pdf\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(\n self.temp_dir.name, extensions=[\".jpeg\", \".mp4\"], keep_zero=False\n )\n expected = Counter({\".jpeg\": 1, \".mp4\": 1})\n self.assertEqual(result, expected)\n def test_case_7(self):\n # Test custom extensions\n file_names = [\"file1.txt\", \"file2.docx\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name, keep_zero=False)\n expected = Counter(\n {\".txt\": 1, \".docx\": 1}\n ) # .xlsx and .csv are omitted because their count is 0 and keep_zero is False\n self.assertEqual(result, expected)\n def test_case_8(self):\n # Test case sensitivity\n file_names = [\"file1.txt\", \"file1.tXt\", \"fiLE.txt\", \"fiLE.TXt\"]\n self.create_test_files(self.temp_dir.name, file_names)\n result = f_804(self.temp_dir.name, extensions=[\".txt\"])\n expected = Counter({\".txt\": 2})\n self.assertEqual(result, expected)", "apis": ["collections.Counter", "glob.glob", "os.path", "os.path.join", 
"os.path.exists"], "libs": ["os", "collections", "glob"], "doc": {"description": ["Traverses a given directory recursively to count files by specified extensions."], "note": ["This function counts files in a case-sensitive manner."], "params": ["directory (str): The path of the directory to search.", "extensions (list of str): File extensions to count. Defaults to ['.txt', '.docx', '.xlsx', '.csv'].", "keep_zero (bool): Whether to include extensions with zero counts. Defaults to True."], "returns": ["Counter: An object containing counts of files for each of the specified extensions."], "reqs": ["os", "glob", "collections"], "raises": ["OSError: If the specified directory does not exist."], "example": ["Examples:", ">>> f_804('/path/to/documents')", "Counter({'.txt': 5, '.docx': 2, '.xlsx': 1, '.csv': 0})", ">>> f_804('/path/to/documents', keep_zero=False)", "Counter({'.txt': 5, '.docx': 2, '.xlsx': 1})", ">>> f_804('/path/to/documents', extensions=['.txt'], keep_zero=False)", "Counter({'.txt': 5})"]}} +{"task_id": "f_560", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.cluster import KMeans\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_560(df):\n \"\"\"\n Given a pandas DataFrame with random numeric values, run KMeans clusters on the data and return the labels.\n\n Parameters:\n - df (DataFrame): The DataFrame to use.\n\n Returns:\n - labels (np.array): The labels from the KMeans clustering.\n\n Requirements:\n - pandas\n - numpy\n - sklearn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.rand(500, 2) * 100, columns=['A', 'B']) \n >>> labels = f_560(df)\n >>> print(labels)\n [0 2 1 0 2 0 2 1 0 1 1 1 0 0 1 1 0 2 1 2 0 0 0 0 1 2 2 2 1 1 1 2 0 0 0 1 0\n 2 1 1 2 1 1 2 2 0 2 2 1 1 0 0 2 0 1 1 2 2 1 2 2 1 1 2 0 1 1 2 2 0 2 1 1 2\n 1 2 0 2 2 0 0 2 0 1 0 1 1 1 2 2 1 2 0 2 1 0 2 1 2 2 1 0 1 0 1 2 1 1 0 2 2\n 1 1 2 2 2 2 0 1 1 2 2 0 0 2 1 2 0 2 1 2 0 2 2 1 2 2 2 2 2 2 1 1 0 0 1 2 0\n 1 1 0 2 2 1 2 1 0 2 1 1 2 1 2 2 1 0 
1 1 2 1 1 1 0 1 0 0 1 0 0 2 0 0 2 2 1\n 1 0 1 1 2 0 2 2 1 2 2 0 0 2 2 0 0 0 1 1 0 2 2 1 2 2 0 0 0 1 0 1 0 0 1 0 1\n 2 2 1 2 0 0 0 1 0 2 2 0 0 0 0 0 0 2 2 0 2 1 2 0 1 1 1 2 2 0 1 2 2 2 2 1 0\n 2 1 2 2 1 0 2 2 2 2 1 2 0 1 0 0 0 2 2 1 2 1 1 0 1 2 0 0 2 0 1 0 1 1 1 1 0\n 1 2 1 1 1 1 0 1 0 0 1 2 1 2 1 1 1 0 1 2 2 0 1 1 1 1 0 2 2 0 2 1 1 2 0 1 1\n 1 1 0 0 0 1 2 2 0 2 1 1 1 1 0 0 0 1 1 0 0 0 2 1 0 2 0 2 0 2 0 1 0 2 0 0 1\n 1 2 0 0 2 0 1 0 2 2 1 0 0 2 0 0 1 1 0 2 2 1 0 1 0 0 2 0 2 2 1 2 0 2 1 2 0\n 2 1 1 1 1 0 1 2 1 1 1 2 2 0 0 1 0 2 0 0 1 0 1 2 1 0 1 2 1 2 1 2 1 0 1 1 1\n 1 2 2 1 0 1 1 0 0 2 1 1 2 1 0 1 2 2 1 0 1 0 2 1 0 0 0 2 1 0 2 2 0 1 1 0 0\n 1 1 2 2 2 1 1 1 2 0 1 2 2 0 2 0 1 2 2]\n \"\"\"", "canonical_solution": " # Perform clustering\n scaler = StandardScaler()\n df_std = scaler.fit_transform(df.values)\n \n # Convert standardized values back to a DataFrame using pd\n df_std = pd.DataFrame(df_std, columns=df.columns)\n \n # Perform clustering with sklearn's KMeans\n kmeans = KMeans(n_clusters=3, random_state=0).fit(df_std)\n labels = kmeans.labels_ # The labels are directly a numpy array\n \n return labels", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.rand(500, 2) * 100, columns=['A', 'B'])\n labels = f_560(df)\n self.assertEqual(len(labels), 500)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_2(self):\n df = pd.DataFrame(np.random.rand(10, 2) * 100, columns=['A', 'B'])\n labels = f_560(df)\n self.assertEqual(len(labels), 10)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_3(self):\n df = pd.DataFrame(np.random.rand(5, 4) * 100, columns=['A', 'B', 'C', 'D'])\n labels = f_560(df)\n self.assertEqual(len(labels), 5)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_4(self):\n df = pd.DataFrame(np.random.rand(20, 3) * 100, columns=['A', 'B', 'C'])\n labels = f_560(df)\n self.assertEqual(len(labels), 20)\n 
self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))\n def test_case_5(self):\n df = pd.DataFrame(np.random.rand(42, 1) * 100, columns=['A'])\n labels = f_560(df)\n self.assertEqual(len(labels), 42)\n self.assertTrue(np.all(np.isin(labels, [0, 1, 2])))", "apis": ["pandas.DataFrame", "sklearn.preprocessing.StandardScaler", "sklearn.cluster.KMeans"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Given a pandas DataFrame with random numeric values, run KMeans clusters on the data and return the labels."], "note": [], "params": ["df (DataFrame): The DataFrame to use."], "returns": ["labels (np.array): The labels from the KMeans clustering."], "reqs": ["pandas", "numpy", "sklearn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.rand(500, 2) * 100, columns=['A', 'B'])", ">>> labels = f_560(df)", ">>> print(labels)", "[0 2 1 0 2 0 2 1 0 1 1 1 0 0 1 1 0 2 1 2 0 0 0 0 1 2 2 2 1 1 1 2 0 0 0 1 0", "2 1 1 2 1 1 2 2 0 2 2 1 1 0 0 2 0 1 1 2 2 1 2 2 1 1 2 0 1 1 2 2 0 2 1 1 2", "1 2 0 2 2 0 0 2 0 1 0 1 1 1 2 2 1 2 0 2 1 0 2 1 2 2 1 0 1 0 1 2 1 1 0 2 2", "1 1 2 2 2 2 0 1 1 2 2 0 0 2 1 2 0 2 1 2 0 2 2 1 2 2 2 2 2 2 1 1 0 0 1 2 0", "1 1 0 2 2 1 2 1 0 2 1 1 2 1 2 2 1 0 1 1 2 1 1 1 0 1 0 0 1 0 0 2 0 0 2 2 1", "1 0 1 1 2 0 2 2 1 2 2 0 0 2 2 0 0 0 1 1 0 2 2 1 2 2 0 0 0 1 0 1 0 0 1 0 1", "2 2 1 2 0 0 0 1 0 2 2 0 0 0 0 0 0 2 2 0 2 1 2 0 1 1 1 2 2 0 1 2 2 2 2 1 0", "2 1 2 2 1 0 2 2 2 2 1 2 0 1 0 0 0 2 2 1 2 1 1 0 1 2 0 0 2 0 1 0 1 1 1 1 0", "1 2 1 1 1 1 0 1 0 0 1 2 1 2 1 1 1 0 1 2 2 0 1 1 1 1 0 2 2 0 2 1 1 2 0 1 1", "1 1 0 0 0 1 2 2 0 2 1 1 1 1 0 0 0 1 1 0 0 0 2 1 0 2 0 2 0 2 0 1 0 2 0 0 1", "1 2 0 0 2 0 1 0 2 2 1 0 0 2 0 0 1 1 0 2 2 1 0 1 0 0 2 0 2 2 1 2 0 2 1 2 0", "2 1 1 1 1 0 1 2 1 1 1 2 2 0 0 1 0 2 0 0 1 0 1 2 1 0 1 2 1 2 1 2 1 0 1 1 1", "1 2 2 1 0 1 1 0 0 2 1 1 2 1 0 1 2 2 1 0 1 0 2 1 0 0 0 2 1 0 2 2 0 1 1 0 0", "1 1 2 2 2 1 1 1 2 0 1 2 2 0 2 0 1 2 2]"]}} +{"task_id": "f_837", "prompt": "import requests\nimport pandas as pd\nfrom bs4 
import BeautifulSoup\n\n\ndef f_837(url: str, csv_file_path: str) -> list:\n \"\"\"\n Extracts title, date, and author information from a webpage and writes the data to a CSV file.\n\n The function iterates through each 'div' element with a class 'container', extracting the text of 'h1', and 'span' elements with classes \n 'date' and 'author', respectively. Default values ('No Title', 'No Date', or 'No Author') are used if an element is \n not found. The extracted data is stored in a list of tuples.\n\n The list of tuples is then converted into a Pandas DataFrame and saved to a CSV file at the specified file path. \n The DataFrame's columns are labeled as 'Title', 'Date', and 'Author'. The function returns the list of tuples.\n\n Raises:\n - RuntimeError: If the URL is incorrect or the server is down, the error message might be \"Error fetching URL: HTTP Error 404: Not Found\" \n or \"Error fetching URL: ConnectionError\". The function begins by making an HTTP request to the specified URL. It sets a timeout of 5 seconds to avoid \n prolonged waiting in case of unresponsive webpages. If the request encounters any exceptions such as connection errors, timeouts, or HTTP errors, a 'requests.RequestException' is raised. \n The function then raises a 'RuntimeError' with a message that includes the details of the exception, which depend on the specific issue encountered.\n\n Parameters:\n - url (str): The URL of the webpage to be parsed.\n - csv_file_path (str): The path where the resulting CSV file will be saved.\n\n Returns:\n list: A list of tuples containing the (title, date, author) extracted from the webpage. 
Default placeholders \n are used for missing information.\n\n Requirements:\n - requests\n - bs4\n - pandas\n\n Example:\n >>> data = f_837('https://example.com/articles', '/path/to/save/csv/file.csv')\n >>> type(data)\n \n >>> len(data) > 0\n True\n \"\"\"", "canonical_solution": "\n\n try:\n response = requests.get(url, timeout=5)\n response.raise_for_status()\n except requests.RequestException as e:\n raise RuntimeError(f\"Error fetching URL: {e}\")\n\n soup = BeautifulSoup(response.text, \"html.parser\")\n data = []\n for div in soup.find_all(\"div\", class_=\"container\"):\n title = div.find(\"h1\").text.strip() if div.find(\"h1\") else \"No Title\"\n date = (\n div.find(\"span\", class_=\"date\").text.strip()\n if div.find(\"span\", class_=\"date\")\n else \"No Date\"\n )\n author = (\n div.find(\"span\", class_=\"author\").text.strip()\n if div.find(\"span\", class_=\"author\")\n else \"No Author\"\n )\n data.append((title, date, author))\n\n df = pd.DataFrame(data, columns=[\"Title\", \"Date\", \"Author\"])\n df.to_csv(csv_file_path, index=False)\n\n return data", "test": "import unittest\nfrom unittest.mock import patch\nimport os\nimport shutil\n# Mock HTML content\ntest_data_1_html = \"\"\"\n\n
\n

Title1

\n Date1\n Author1\n
\n
\n

Title2

\n Date2\n Author2\n
\n\n\"\"\"\ntest_data_2_html = \"\"\"\n\n
\n

TitleA

\n DateA\n AuthorA\n
\n\n\"\"\"\nclass MockResponse:\n \"\"\"Mock class for requests.Response\"\"\"\n def __init__(self, text, status_code):\n self.text = text\n self.status_code = status_code\n def raise_for_status(self):\n if self.status_code != 200:\n raise Exception(\"HTTP Error\")\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the f_837 function\"\"\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up any necessary resources before any tests are run.\"\"\"\n os.makedirs(\"mnt/data\", exist_ok=True) # Create the directory for test files\n @patch(\"requests.get\")\n def test_html_parsing_multiple_entries(self, mock_get):\n \"\"\"Test parsing of HTML with multiple data entries.\"\"\"\n mock_get.return_value = MockResponse(test_data_1_html, 200)\n url = \"https://example.com/test_data_1.html\"\n csv_file_path = \"mnt/data/output_1.csv\"\n expected_output = [\n (\"Title1\", \"Date1\", \"Author1\"),\n (\"Title2\", \"Date2\", \"Author2\"),\n ]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_single_entry(self, mock_get):\n \"\"\"Test parsing of HTML with a single data entry.\"\"\"\n mock_get.return_value = MockResponse(test_data_2_html, 200)\n url = \"https://example.com/test_data_2.html\"\n csv_file_path = \"mnt/data/output_2.csv\"\n expected_output = [(\"TitleA\", \"DateA\", \"AuthorA\")]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_with_same_data_as_first(self, mock_get):\n \"\"\"Test parsing of HTML similar to first test case.\"\"\"\n mock_get.return_value = MockResponse(test_data_1_html, 200)\n url = \"https://example.com/test_data_1.html\"\n csv_file_path = \"mnt/data/output_3.csv\"\n expected_output = [\n (\"Title1\", \"Date1\", \"Author1\"),\n (\"Title2\", \"Date2\", \"Author2\"),\n ]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_with_same_data_as_second(self, 
mock_get):\n \"\"\"Test parsing of HTML similar to second test case.\"\"\"\n mock_get.return_value = MockResponse(test_data_2_html, 200)\n url = \"https://example.com/test_data_2.html\"\n csv_file_path = \"mnt/data/output_4.csv\"\n expected_output = [(\"TitleA\", \"DateA\", \"AuthorA\")]\n self.assertEqual(f_837(url, csv_file_path), expected_output)\n @patch(\"requests.get\")\n def test_html_parsing_with_nonexistent_url(self, mock_get):\n \"\"\"Test handling of HTTP error when URL does not exist.\"\"\"\n mock_get.return_value = MockResponse(\"\", 404) # Simulating a 404 error\n url = \"https://example.com/non_existent.html\" # Non-existent URL\n csv_file_path = \"mnt/data/output_5.csv\"\n with self.assertRaises(Exception):\n f_837(url, csv_file_path) # Should raise HTTP Error\n @patch(\"requests.get\")\n def test_f_837_request_exception(self, mock_get):\n \"\"\"Test f_837 raises an exception when there is a request error.\"\"\"\n mock_get.side_effect = requests.RequestException(\"Error fetching URL\")\n url = \"https://example.com/non_existent.html\"\n csv_file_path = \"mnt/data/output_error.csv\"\n with self.assertRaises(Exception) as context:\n f_837(url, csv_file_path)\n self.assertIn(\"Error fetching URL\", str(context.exception))\n @classmethod\n def tearDownClass(cls):\n \"\"\"Clean up shared resources after all tests in the class have completed.\"\"\"\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["pandas.DataFrame", "requests.RequestException", "bs4.BeautifulSoup", "requests.get"], "libs": ["bs4", "pandas", "requests"], "doc": {"description": ["Extracts title, date, and author information from a webpage and writes the data to a CSV file.", "The function iterates through each 'div' element with a class 'container', extracting the text of 'h1', and 'span' elements with classes", "'date' and 'author', respectively. 
Default values ('No Title', 'No Date', or 'No Author') are used if an element is", "not found. The extracted data is stored in a list of tuples.", "The list of tuples is then converted into a Pandas DataFrame and saved to a CSV file at the specified file path.", "The DataFrame's columns are labeled as 'Title', 'Date', and 'Author'. The function returns the list of tuples."], "note": [], "params": ["url (str): The URL of the webpage to be parsed.", "csv_file_path (str): The path where the resulting CSV file will be saved."], "returns": ["list: A list of tuples containing the (title, date, author) extracted from the webpage. Default placeholders", "are used for missing information."], "reqs": ["requests", "bs4", "pandas"], "raises": ["RuntimeError: If the URL is incorrect or the server is down, the error message might be \"Error fetching URL: HTTP Error 404: Not Found\"", "or \"Error fetching URL: ConnectionError\". The function begins by making an HTTP request to the specified URL. It sets a timeout of 5 seconds to avoid", "prolonged waiting in case of unresponsive webpages. If the request encounters any exceptions such as connection errors, timeouts, or HTTP errors, a 'requests.RequestException' is raised.", "The function then raises a 'RuntimeError' with a message that includes the details of the exception, which depend on the specific issue encountered."], "example": [">>> data = f_837('https://example.com/articles', '/path/to/save/csv/file.csv')", ">>> type(data)", "", ">>> len(data) > 0", "True"]}} +{"task_id": "f_740", "prompt": "from collections import Counter\nimport random\n\nLETTERS = ['a', 'b', 'c', 'd', 'e']\n\ndef f_740(count, seed=0):\n \"\"\"\n Generate a specific number of random letter pairs, each from a predefined list, and analyze the frequency of each pair.\n\n Parameters:\n - count (int): The number of letter pairs to generate.\n - seed (int, optional): Seed for the random number generator to ensure reproducibility. 
Defaults to 0.\n\n Returns:\n - Counter: A Counter object representing the frequency of each generated letter pair.\n\n Requirements:\n - collections.Counter\n - random\n\n Examples:\n >>> f_740(5, seed=42)\n Counter({('d', 'a'): 1, ('b', 'b'): 1, ('d', 'd'): 1, ('e', 'a'): 1, ('c', 'a'): 1})\n >>> f_740(0, seed=42)\n Counter()\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n pairs = [tuple(random.choices(LETTERS, k=2)) for _ in range(count)]\n pair_frequency = Counter(pairs)\n\n return pair_frequency", "test": "import unittest\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Initialize random seed for reproducibility in tests\n random.seed(42)\n def test_case_1(self):\n # Test with count = 5\n result = f_740(5, seed=42)\n self.assertIsInstance(result, Counter)\n self.assertEqual(result, Counter({('d', 'a'): 1, ('b', 'b'): 1, ('d', 'd'): 1, ('e', 'a'): 1, ('c', 'a'): 1}))\n def test_case_2(self):\n # Test with count = 0 (no pairs)\n result = f_740(0, seed=4)\n self.assertEqual(result, Counter())\n def test_case_3(self):\n # Test with count = 100 (larger number)\n result = f_740(100, seed=2)\n self.assertEqual(sum(result.values()), 100)\n def test_case_4(self):\n # Test with count = 10 and check if all pairs have letters from the defined LETTERS\n result = f_740(10, seed=0)\n self.assertEqual(result, Counter({('c', 'c'): 2, ('d', 'b'): 2, ('e', 'e'): 2, ('e', 'd'): 1, ('c', 'b'): 1, ('e', 'c'): 1, ('b', 'd'): 1}))\n def test_case_5(self):\n # Test with count = 5 and check if the total counts match the input count\n result = f_740(5, seed=1)\n self.assertEqual(result, Counter({('a', 'e'): 1, ('d', 'b'): 1, ('c', 'c'): 1, ('d', 'd'): 1, ('a', 'a'): 1}))", "apis": ["collections.Counter", "random.seed", "random.choices"], "libs": ["collections", "random"], "doc": {"description": ["Generate a specific number of random letter pairs, each from a predefined list, and analyze the frequency of each pair."], "note": [], 
"params": ["count (int): The number of letter pairs to generate.", "seed (int, optional): Seed for the random number generator to ensure reproducibility. Defaults to 0."], "returns": ["Counter: A Counter object representing the frequency of each generated letter pair."], "reqs": ["collections.Counter", "random"], "raises": [], "example": ["Examples:", ">>> f_740(5, seed=42)", "Counter({('d', 'a'): 1, ('b', 'b'): 1, ('d', 'd'): 1, ('e', 'a'): 1, ('c', 'a'): 1})", ">>> f_740(0, seed=42)", "Counter()"]}} +{"task_id": "f_869", "prompt": "import numpy as np\nfrom scipy.stats import ttest_ind\nimport matplotlib.pyplot as plt\n\n\ndef f_869(kwargs):\n \"\"\"\n Performs a two-sample t-test on numerical data from two groups to determine if there is a significant\n difference in their means. The function handles NaN values, computes descriptive statistics for each group,\n and generates a boxplot and histograms for data visualization.\n\n Parameters:\n - kwargs (dict): A dictionary with two keys, 'group1' and 'group2'. Each key maps to a list of numbers.\n Lists can contain NaN values, which will be excluded from analysis.\n\n Returns:\n - dict: A dictionary containing:\n - 'significant': Boolean. 
True if the means of the two groups are significantly different (p < 0.05).\n - 'group1_stats': Dictionary with mean and standard deviation of 'group1' (excluding NaNs).\n - 'group2_stats': Dictionary with mean and standard deviation of 'group2' (excluding NaNs).\n - 'ax_boxplot': A matplotlib Axes object with a boxplot comparing 'group1' and 'group2'.\n - 'ax_histogram': A matplotlib Axes object with histograms of 'group1' and 'group2'.\n\n Raises:\n - ValueError: If either group is empty, contains only NaN values, has less than two non-NaN values,\n or if the variance in one or both groups is below a threshold (1e-8).\n\n Requirements:\n - numpy\n - scipy\n - matplotlib\n\n Note:\n - The function sets the significance level (alpha) at 0.05.\n - It removes NaN values before performing any calculations or plotting.\n - A t-test is performed with the 'nan_policy' set to 'omit' to ignore NaNs.\n - The function checks for sufficient non-NaN data points and adequate variance in each group before conducting the t-test.\n - The boxplot and histograms provide a visual comparison of the data distributions.\n \n Example:\n >>> data = {'group1': [1, 2, 3, 4], 'group2': [5, 6, 7, 8]}\n >>> results = f_869(data)\n >>> results['significant']\n True\n \"\"\"", "canonical_solution": " alpha = 0.05 # Define the significance level\n\n group1 = np.array(kwargs.get(\"group1\", []))\n group2 = np.array(kwargs.get(\"group2\", []))\n\n # Check for empty or all-NaN groups\n if (\n len(group1) == 0\n or len(group2) == 0\n or np.all(np.isnan(group1))\n or np.all(np.isnan(group2))\n ):\n raise ValueError(\"One or both groups are empty or contain only NaN values.\")\n\n # Removing NaN values and ensuring sufficient data\n valid_group1 = group1[~np.isnan(group1)]\n valid_group2 = group2[~np.isnan(group2)]\n\n # Check for sufficient size and variance\n if len(valid_group1) < 2 or len(valid_group2) < 2:\n raise ValueError(\"Each group must have at least two non-NaN values.\")\n\n if 
np.var(valid_group1) < 1e-8 or np.var(valid_group2) < 1e-8:\n raise ValueError(\"Variance in one or both groups is too low.\")\n\n # Perform t-test\n _, p_val = ttest_ind(valid_group1, valid_group2, nan_policy=\"omit\")\n\n significant = p_val < alpha\n\n # Calculate descriptive statistics\n group1_stats = {\"mean\": np.mean(valid_group1), \"std\": np.std(valid_group1)}\n group2_stats = {\"mean\": np.mean(valid_group2), \"std\": np.std(valid_group2)}\n\n # Plotting\n _, (ax_boxplot, ax_histogram) = plt.subplots(2, 1, figsize=(8, 12))\n\n # Boxplot\n ax_boxplot.boxplot([valid_group1, valid_group2], labels=[\"group1\", \"group2\"])\n\n # Histogram\n ax_histogram.hist(valid_group1, alpha=0.5, label=\"group1\")\n ax_histogram.hist(valid_group2, alpha=0.5, label=\"group2\")\n ax_histogram.legend()\n\n return {\n \"significant\": significant,\n \"group1_stats\": group1_stats,\n \"group2_stats\": group2_stats,\n \"ax_boxplot\": ax_boxplot,\n \"ax_histogram\": ax_histogram,\n }", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n def test_different_means(self):\n \"\"\"Test with groups having significantly different means.\"\"\"\n data = {\"group1\": [1, 2, 3], \"group2\": [4, 5, 6]}\n result = f_869(data)\n self.assertTrue(result[\"significant\"])\n def test_similar_means(self):\n \"\"\"Test with groups having similar means.\"\"\"\n data = {\"group1\": [1, 2, 3], \"group2\": [1, 2, 3]}\n result = f_869(data)\n self.assertFalse(result[\"significant\"])\n def test_with_nan_values(self):\n \"\"\"Test with groups containing NaN values but with at least two non-NaN values in each group.\"\"\"\n data = {\"group1\": [np.nan, 2, 3], \"group2\": [1, np.nan, 3]}\n result = f_869(data)\n self.assertIsNotNone(result)\n def test_empty_group(self):\n \"\"\"Test with one of the groups being empty.\"\"\"\n data = {\"group1\": [], \"group2\": [1, 2, 3]}\n with self.assertRaises(ValueError):\n f_869(data)\n def 
test_all_nan_values(self):\n \"\"\"Test with groups containing only NaN values.\"\"\"\n data = {\"group1\": [np.nan, np.nan], \"group2\": [np.nan, np.nan]}\n with self.assertRaises(ValueError):\n f_869(data)\n def test_insufficient_group_size(self):\n \"\"\"Test with one of the groups having less than two non-NaN values.\"\"\"\n data = {\"group1\": [1, np.nan], \"group2\": [2, 3, 4]}\n with self.assertRaises(ValueError):\n f_869(data)\n def test_low_variance(self):\n \"\"\"Test with one of the groups having extremely low variance.\"\"\"\n data = {\"group1\": [1.00000001, 1.00000002], \"group2\": [2, 3, 4]}\n with self.assertRaises(ValueError):\n f_869(data)", "apis": ["numpy.var", "scipy.stats.ttest_ind", "numpy.array", "numpy.all", "numpy.mean", "numpy.isnan", "matplotlib.pyplot.subplots", "numpy.std"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Performs a two-sample t-test on numerical data from two groups to determine if there is a significant", "difference in their means. The function handles NaN values, computes descriptive statistics for each group,", "and generates a boxplot and histograms for data visualization."], "note": ["The function sets the significance level (alpha) at 0.05.", "It removes NaN values before performing any calculations or plotting.", "A t-test is performed with the 'nan_policy' set to 'omit' to ignore NaNs.", "The function checks for sufficient non-NaN data points and adequate variance in each group before conducting the t-test.", "The boxplot and histograms provide a visual comparison of the data distributions."], "params": ["kwargs (dict): A dictionary with two keys, 'group1' and 'group2'. Each key maps to a list of numbers.", "Lists can contain NaN values, which will be excluded from analysis."], "returns": ["dict: A dictionary containing:", "'significant': Boolean. 
True if the means of the two groups are significantly different (p < 0.05).", "'group1_stats': Dictionary with mean and standard deviation of 'group1' (excluding NaNs).", "'group2_stats': Dictionary with mean and standard deviation of 'group2' (excluding NaNs).", "'ax_boxplot': A matplotlib Axes object with a boxplot comparing 'group1' and 'group2'.", "'ax_histogram': A matplotlib Axes object with histograms of 'group1' and 'group2'."], "reqs": ["numpy", "scipy", "matplotlib"], "raises": ["ValueError: If either group is empty, contains only NaN values, has less than two non-NaN values,", "or if the variance in one or both groups is below a threshold (1e-8)."], "example": [">>> data = {'group1': [1, 2, 3, 4], 'group2': [5, 6, 7, 8]}", ">>> results = f_869(data)", ">>> results['significant']", "True"]}} +{"task_id": "f_580", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LinearRegression\n\ndef f_580(df):\n \"\"\"\n Use a linear regression model to predict the \"value\" of \"feature\" in the given dataframe and return the coefficients and intercept.\n\n Parameters:\n - df (pd.DataFrame): pandas DataFrame that contains columns named 'feature' and 'value'.\n\n Returns:\n - result (dict): A dictionary with the coefficients and the intercept of the fitted linear regression model.\n\n Requirements:\n - pandas\n - numpy\n - sklearn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame({'feature': np.random.rand(100), 'value': np.random.rand(100)})\n >>> coefficients = f_580(df)\n >>> print(coefficients)\n {'coefficients': [[-0.03353164387961974]], 'intercept': [0.5135976564010359]}\n \"\"\"", "canonical_solution": " X = np.array(df['feature']).reshape(-1,1) # Explicitly converting to numpy array and reshaping\n y = np.array(df['value']).reshape(-1,1) # Explicitly converting to numpy array and reshaping\n\n model = LinearRegression().fit(X, y)\n\n return {'coefficients': model.coef_.tolist(), 'intercept': 
model.intercept_.tolist()}", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'feature': np.random.rand(100), 'value': np.random.rand(100)})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n def test_case_2(self):\n df = pd.DataFrame({'feature': [1, 2, 3, 4, 5], 'value': [1, 2, 3, 4, 5]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n self.assertAlmostEqual(coefficients['coefficients'][0][0], 1.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 0.0)\n def test_case_3(self):\n df = pd.DataFrame({'feature': [1, 2, 3, 4, 5], 'value': [2, 4, 6, 8, 10]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n self.assertAlmostEqual(coefficients['coefficients'][0][0], 2.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 0.0)\n def test_case_4(self):\n df = pd.DataFrame({'feature': [0, 0, 0, 0, 0], 'value': [1, 2, 3, 4, 5]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n self.assertAlmostEqual(coefficients['coefficients'][0][0], 0.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 3.0)\n def test_case_5(self):\n df = pd.DataFrame({'feature': [1, 2, 3, 4, 5], 'value': [0, 0, 0, 0, 0]})\n coefficients = f_580(df)\n self.assertEqual(len(coefficients['coefficients']), 1)\n self.assertEqual(len(coefficients['coefficients'][0]), 1)\n self.assertEqual(len(coefficients['intercept']), 1)\n 
self.assertAlmostEqual(coefficients['coefficients'][0][0], 0.0)\n self.assertAlmostEqual(coefficients['intercept'][0], 0.0)", "apis": ["sklearn.linear_model.LinearRegression", "numpy.array"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Use a linear regression model to predict the \"value\" of \"feature\" in the given dataframe and return the coefficients and intercept."], "note": [], "params": ["df (pd.DataFrame): pandas DataFrame that contains columns named 'feature' and 'value'."], "returns": ["result (dict): A dictionary with the coefficients and the intercept of the fitted linear regression model."], "reqs": ["pandas", "numpy", "sklearn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame({'feature': np.random.rand(100), 'value': np.random.rand(100)})", ">>> coefficients = f_580(df)", ">>> print(coefficients)", "{'coefficients': [[-0.03353164387961974]], 'intercept': [0.5135976564010359]}"]}} +{"task_id": "f_528", "prompt": "import heapq\nimport collections\n\ndef f_528(x, n):\n \"\"\"\n Find the n most common letters in a dictionary, x, where the key letters and the values are their frequencies.\n\n Parameters:\n - x (dict): The dictionary of letter frequencies.\n - n (int): The number of most frequent letters to return.\n\n Returns:\n - list: The n most frequent letters.\n\n Requirements:\n - heapq\n - collections\n\n Example:\n >>> f_528({'a': 1, 'b': 2, 'c': 3}, 2)\n ['c', 'b']\n \"\"\"", "canonical_solution": " counter = collections.Counter(x)\n most_frequent = heapq.nlargest(n, counter.keys(), key=counter.get)\n\n return most_frequent", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 2), ['c', 'b'])\n def test_case_2(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 1), ['c'])\n def test_case_3(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 3), ['c', 'b', 'a'])\n def test_case_4(self):\n 
self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 0), [])\n def test_case_5(self):\n self.assertEqual(f_528({'a': 1, 'b': 2, 'c': 3}, 4), ['c', 'b', 'a'])", "apis": ["heapq.nlargest", "collections.Counter"], "libs": ["collections", "heapq"], "doc": {"description": ["Find the n most common letters in a dictionary, x, where the key letters and the values are their frequencies."], "note": [], "params": ["x (dict): The dictionary of letter frequencies.", "n (int): The number of most frequent letters to return."], "returns": ["list: The n most frequent letters."], "reqs": ["heapq", "collections"], "raises": [], "example": [">>> f_528({'a': 1, 'b': 2, 'c': 3}, 2)", "['c', 'b']"]}} +{"task_id": "f_413", "prompt": "import json\nimport numpy as np\nfrom collections import defaultdict\nimport matplotlib.pyplot as plt\n\n\ndef f_413(input_file):\n \"\"\"\n Reads a JSON file containing a list of dictionaries. For each key across all dictionaries,\n calculates the mean and median of its values using numpy. Visualizes the mean and median\n using bar charts. 
Returns the results and plots.\n\n Parameters:\n - input_file (str): Path to the input JSON file containing a list of dictionaries.\n\n Returns:\n - result (dict): each key corresponds to those in the input dictionaries, and the corresponding\n value is another dict with keys 'mean' and 'median', representing the calculated statistics.\n - plots (list[matplotlib.axes._subplots.Axes]): A list of bar charts, one for\n each key in the dictionaries, visualizing the mean and median values.\n\n Requirements:\n - json\n - numpy\n - collections.defaultdict\n - matplotlib.pyplot\n\n Example:\n >>> results, plots = f_413(\"sample_data.json\")\n >>> type(plots[0])\n <class 'matplotlib.axes._axes.Axes'>\n >>> results\n {'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 6.0, 'median': 6.0}}\n \"\"\"", "canonical_solution": " with open(input_file, \"r\") as f:\n data = json.load(f)\n\n stats = defaultdict(list)\n for d in data:\n for key, value in d.items():\n stats[key].append(value)\n\n result = {k: {\"mean\": np.mean(v), \"median\": np.median(v)} for k, v in stats.items()}\n\n plots = []\n for key, values in result.items():\n _, ax = plt.subplots()\n ax.bar([\"mean\", \"median\"], [values[\"mean\"], values[\"median\"]])\n ax.set_title(f\"Statistics of {key}\")\n plots.append(ax)\n return result, plots", "test": "import matplotlib\nimport unittest\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.temp_dir = tempfile.TemporaryDirectory()\n cls.test_data = {\n \"test_1.json\": [{\"a\": 2, \"b\": 4}, {\"a\": 4, \"b\": 8}],\n \"test_2.json\": [{\"x\": 1}, {\"y\": 2}, {\"z\": 6}],\n \"invalid.json\": {\"not\": \"valid\"},\n \"empty.json\": [],\n }\n # Generate test files\n for filename, content in cls.test_data.items():\n with open(os.path.join(cls.temp_dir.name, filename), \"w\") as f:\n json.dump(content, f)\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n # Check plot generation\n 
expected_titles = [\"a\", \"b\"]\n _, plots = f_413(os.path.join(self.temp_dir.name, \"test_1.json\"))\n self.assertEqual(len(plots), len(expected_titles))\n for plot, title in zip(plots, expected_titles):\n assert isinstance(plot, matplotlib.axes._axes.Axes)\n self.assertTrue(plot.get_title(), f\"Statistics of {title}\")\n def test_case_2(self):\n # Check result correctness\n results, _ = f_413(os.path.join(self.temp_dir.name, \"test_1.json\"))\n self.assertIn(\"a\", results)\n self.assertIn(\"b\", results)\n self.assertEqual(results[\"a\"][\"mean\"], 3.0)\n self.assertEqual(results[\"a\"][\"median\"], 3.0)\n self.assertEqual(results[\"b\"][\"mean\"], 6.0)\n self.assertEqual(results[\"b\"][\"median\"], 6.0)\n def test_case_3(self):\n # Test with invalid data structure (not a list of dicts)\n with self.assertRaises(AttributeError):\n f_413(os.path.join(self.temp_dir.name, \"invalid.json\"))\n def test_case_4(self):\n # Test with empty data\n results, plots = f_413(os.path.join(self.temp_dir.name, \"empty.json\"))\n self.assertEqual(results, {})\n self.assertEqual(len(plots), 0)\n def test_case_5(self):\n # Test handling nested dicts with one key each\n results, _ = f_413(os.path.join(self.temp_dir.name, \"test_2.json\"))\n self.assertIn(\"x\", results)\n self.assertIn(\"y\", results)\n self.assertIn(\"z\", results)\n self.assertEqual(results[\"x\"][\"mean\"], 1.0)\n self.assertEqual(results[\"x\"][\"median\"], 1.0)\n self.assertEqual(results[\"y\"][\"mean\"], 2.0)\n self.assertEqual(results[\"y\"][\"median\"], 2.0)\n self.assertEqual(results[\"z\"][\"mean\"], 6.0)\n self.assertEqual(results[\"z\"][\"median\"], 6.0)\n def test_case_6(self):\n # Test with nonexistent filename\n with self.assertRaises(FileNotFoundError):\n f_413(os.path.join(self.temp_dir.name, \"NOTEXISTS.json\"))", "apis": ["collections.defaultdict", "numpy.mean", "matplotlib.pyplot.subplots", "numpy.median", "json.load"], "libs": ["collections", "json", "numpy", "matplotlib"], "doc": 
{"description": ["Reads a JSON file containing a list of dictionaries. For each key across all dictionaries,", "calculates the mean and median of its values using numpy. Visualizes the mean and median", "using bar charts. Returns the results and plots."], "note": [], "params": ["input_file (str): Path to the input JSON file containing a list of dictionaries."], "returns": ["result (dict): each key corresponds to those in the input dictionaries, and the corresponding", "value is another dict with keys 'mean' and 'median', representing the calculated statistics.", "plots (list[matplotlib.axes._subplots.Axes]): A list of bar charts, one for", "each key in the dictionaries, visualizing the mean and median values."], "reqs": ["json", "numpy", "collections.defaultdict", "matplotlib.pyplot"], "raises": [], "example": [">>> results, plots = f_413(\"sample_data.json\")", ">>> type(plots[0])", "<class 'matplotlib.axes._axes.Axes'>", ">>> results", "{'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 6.0, 'median': 6.0}}"]}} +{"task_id": "f_917", "prompt": "import time\nimport matplotlib.pyplot as plt\n\n\ndef f_917(time_strings, time_format=\"%d/%m/%Y %H:%M:%S.%f\"):\n \"\"\"\n Parses a list of time strings and plots a histogram of the seconds component.\n\n Parameters:\n - time_strings (list of str): A list of time strings to be parsed. Each string in the list should\n be formatted according to the 'time_format' parameter.\n - time_format (str): The format string for parsing the time strings in 'time_strings'.\n The default format is '%d/%m/%Y %H:%M:%S.%f', representing day/month/year hours:minutes:seconds.microseconds.\n\n Returns:\n - ax (matplotlib.axes._subplots.Axes or None): An Axes object with the histogram plotted if\n parsing is successful. 
Returns None if a parsing error occurs.\n\n Requirements:\n - time\n - matplotlib\n \n Raises:\n - ValueError: If any time string in 'time_strings' cannot be parsed according to 'time_format'.\n\n Example:\n >>> time_strings = ['30/03/2009 16:31:32.123', '15/04/2010 14:25:46.789', '20/12/2011 12:34:56.000']\n >>> ax = f_917(time_strings)\n >>> plt.show() # Display the plot\n \"\"\"", "canonical_solution": " try:\n seconds = [time.strptime(ts, time_format).tm_sec for ts in time_strings]\n _, ax = plt.subplots()\n ax.hist(seconds, bins=60, rwidth=0.8)\n return ax\n except ValueError as e:\n print(f\"Error parsing time strings: {e}\")\n return None", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_917.\"\"\"\n def test_histogram_counts(self):\n \"\"\"Test the counts in the histogram.\"\"\"\n time_strings = [\n \"30/03/2009 16:31:32.123\",\n \"15/04/2010 14:25:46.789\",\n \"20/12/2011 12:34:56.000\",\n ]\n ax = f_917(time_strings)\n # Extract histogram data\n n_values = [patch.get_height() for patch in ax.patches]\n # Check the count of values in each bin\n self.assertTrue(1 in n_values)\n def test_histogram_title(self):\n \"\"\"Test the title of the histogram.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings)\n self.assertEqual(ax.get_title(), \"\")\n def test_histogram_xaxis(self):\n \"\"\"Test the x-axis label of the histogram.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings)\n self.assertEqual(ax.get_xlabel(), \"\")\n def test_histogram_yaxis(self):\n \"\"\"Test the y-axis label of the histogram.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings)\n self.assertEqual(ax.get_ylabel(), \"\")\n def test_large_input(self):\n \"\"\"Test with a large input.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"] * 50\n ax = f_917(time_strings)\n # Extract histogram data\n n_values = [patch.get_height() 
for patch in ax.patches]\n # Check the count of values in the specific bin corresponding to the seconds value \"32\"\n self.assertTrue(50 in n_values)\n def test_invalid_time_format(self):\n \"\"\"Test with an invalid time format.\"\"\"\n time_strings = [\"30/03/2009 16:31:32.123\"]\n ax = f_917(time_strings, time_format=\"%d/%m/%Y %H:%M:%S\")\n self.assertIsNone(ax)\n def tearDown(self):\n plt.close()", "apis": ["matplotlib.pyplot.subplots", "time.strptime"], "libs": ["time", "matplotlib"], "doc": {"description": ["Parses a list of time strings and plots a histogram of the seconds component."], "note": [], "params": ["time_strings (list of str): A list of time strings to be parsed. Each string in the list should", "be formatted according to the 'time_format' parameter.", "time_format (str): The format string for parsing the time strings in 'time_strings'.", "The default format is '%d/%m/%Y %H:%M:%S.%f', representing day/month/year hours:minutes:seconds.microseconds."], "returns": ["ax (matplotlib.axes._subplots.Axes or None): An Axes object with the histogram plotted if", "parsing is successful. 
Returns None if a parsing error occurs."], "reqs": ["time", "matplotlib"], "raises": ["ValueError: If any time string in 'time_strings' cannot be parsed according to 'time_format'."], "example": [">>> time_strings = ['30/03/2009 16:31:32.123', '15/04/2010 14:25:46.789', '20/12/2011 12:34:56.000']", ">>> ax = f_917(time_strings)", ">>> plt.show() # Display the plot"]}} +{"task_id": "f_542", "prompt": "import pandas as pd\nimport json\n\n\ndef f_542(file_path, key):\n \"\"\"\n Load a JSON file into a Pandas DataFrame, remove a specific key from each object and write the processed DataFrame back into a JSON file.\n \n Parameters:\n - file_path (str): The path to the JSON file.\n - key (str): The key to remove from each object.\n \n Returns:\n - df (DataFrame): A pandas DataFrame representation of the processed JSON data.\n\n Requirements:\n - pandas\n - json\n \n Example:\n >>> df = f_542('data.json', 'ele')\n \"\"\"", "canonical_solution": " with open(file_path, 'r') as file:\n data = json.load(file)\n\n df = pd.DataFrame(data)\n df.drop(key, axis=1, inplace=True)\n\n with open(file_path, 'w') as file:\n file.write(df.to_json(orient='records'))\n\n return df", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def base(self, json_path, key, contents):\n # Create JSON file\n with open(json_path, 'w') as file:\n json.dump(contents, file)\n # Run function\n df = f_542(json_path, key)\n # Check key is removed\n self.assertFalse(key in df.columns)\n # Check JSON file is updated\n with open(json_path, 'r') as file:\n data = json.load(file)\n self.assertFalse(key in data[0])\n # Remove JSON file\n os.remove(json_path)\n def test_case_1(self):\n self.base('data.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}])\n def test_case_2(self):\n self.base('data.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}, {'ele': 5, 'a': 6}])\n def test_case_3(self):\n self.base('x.json', 'zzz', [{'zzz': 1, 'a': 2}, {'zzz': 3, 'a': 4}])\n def 
test_case_4(self):\n self.base('g.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}])\n def test_case_5(self):\n self.base('data.json', 'ele', [{'ele': 1, 'a': 2}, {'ele': 3, 'a': 4}])", "apis": ["pandas.DataFrame", "json.load"], "libs": ["json", "pandas"], "doc": {"description": ["Load a JSON file into a Pandas DataFrame, remove a specific key from each object and write the processed DataFrame back into a JSON file."], "note": [], "params": ["file_path (str): The path to the JSON file.", "key (str): The key to remove from each object."], "returns": ["df (DataFrame): A pandas DataFrame representation of the processed JSON data."], "reqs": ["pandas", "json"], "raises": [], "example": [">>> df = f_542('data.json', 'ele')"]}} +{"task_id": "f_847", "prompt": "import urllib.request\nimport re\nfrom collections import Counter\nimport matplotlib.pyplot as plt\n\n\ndef f_847(url):\n \"\"\"\n Downloads a text file from a specified URL, processes the text to count the frequency of each word,\n and then plots a bar chart showing the ten most frequently occurring words.\n\n Parameters:\n url (str): The URL from which the text file is to be downloaded. 
The URL should point directly to a text file.\n\n Returns:\n tuple: A tuple containing two elements:\n - Counter: A Counter object from the collections module, containing word frequencies in the text.\n - Axes: A matplotlib Axes object that represents the plotted bar chart of the ten most common words.\n\n Note:\n - The function assumes the URL points to a plain text file and may not handle binary files or non-text content correctly.\n - Words are identified using a basic regular expression and are case-sensitive.\n - The function does not remove common stopwords; all words are counted as is.\n - Requires internet access to download the file from the URL.\n\n Example:\n >>> word_freq, ax = f_847('http://www.example.com/data.txt')\n >>> print(word_freq.most_common(5))\n [('the', 102), ('of', 76), ('and', 64), ('to', 52), ('in', 41)]\n\n Requirements:\n - urllib\n - re\n - collections\n - matplotlib\n \n \"\"\"", "canonical_solution": " with urllib.request.urlopen(url) as response:\n text = response.read().decode()\n words = re.findall(r\"\\b\\w+\\b\", text)\n word_freq = Counter(words)\n top_words = word_freq.most_common(10)\n\n _, ax = plt.subplots()\n ax.bar([word[0] for word in top_words], [word[1] for word in top_words])\n ax.set_title(\"Top 10 Most Common Words\")\n ax.set_xlabel(\"Words\")\n ax.set_ylabel(\"Frequency\")\n\n return word_freq, ax", "test": "import unittest\nfrom unittest.mock import patch\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_847 function.\"\"\"\n @patch(\"urllib.request.urlopen\")\n def test_word_frequencies(self, mock_urlopen):\n \"\"\"Test that the function returns the correct word frequencies.\"\"\"\n # Mock the response data\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n b\"OpenAI OpenAI OpenAI benefits\"\n )\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(word_freq[\"OpenAI\"], 3)\n 
self.assertEqual(word_freq[\"benefits\"], 1)\n self.assertIsNotNone(ax)\n @patch(\"urllib.request.urlopen\")\n def test_empty_file(self, mock_urlopen):\n \"\"\"Test that the function returns an empty Counter object for an empty file.\"\"\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = b\"\"\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(len(word_freq), 0)\n self.assertIsNotNone(ax)\n @patch(\"urllib.request.urlopen\")\n def test_non_text_file(self, mock_urlopen):\n \"\"\"Test that the function raises an error for a non-text file.\"\"\"\n # Simulate a case where the URL does not point to a text file\n mock_urlopen.side_effect = Exception(\"Non-text file error\")\n with self.assertRaises(Exception):\n f_847(\"http://example.com\")\n @patch(\"urllib.request.urlopen\")\n def test_special_characters(self, mock_urlopen):\n \"\"\"Test that the function counts special characters as words.\"\"\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n b\"1234567890\"\n )\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(word_freq[\"1234567890\"], 1)\n self.assertIsNotNone(ax)\n @patch(\"urllib.request.urlopen\")\n def test_large_input(self, mock_urlopen):\n \"\"\"Test that the function can handle a large input.\"\"\"\n # Mock a large input\n mock_text = \" \".join([\"OpenAI\"] * 10000)\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n mock_text.encode()\n )\n word_freq, ax = f_847(\"http://example.com\")\n self.assertIsInstance(word_freq, Counter)\n self.assertEqual(word_freq[\"OpenAI\"], 10000)\n self.assertIsNotNone(ax)\n def tearDown(self):\n plt.clf()", "apis": ["urllib.request", "collections.Counter", "urllib.request.urlopen", "re.findall", "matplotlib.pyplot.subplots"], "libs": ["re", "collections", "urllib", "matplotlib"], "doc": {"description": ["Downloads a text file from a 
specified URL, processes the text to count the frequency of each word,", "and then plots a bar chart showing the ten most frequently occurring words."], "note": ["The function assumes the URL points to a plain text file and may not handle binary files or non-text content correctly.", "Words are identified using a basic regular expression and are case-sensitive.", "The function does not remove common stopwords; all words are counted as is.", "Requires internet access to download the file from the URL."], "params": ["url (str): The URL from which the text file is to be downloaded. The URL should point directly to a text file."], "returns": ["tuple: A tuple containing two elements:", "Counter: A Counter object from the collections module, containing word frequencies in the text.", "Axes: A matplotlib Axes object that represents the plotted bar chart of the ten most common words."], "reqs": ["urllib", "re", "collections", "matplotlib"], "raises": [], "example": [">>> word_freq, ax = f_847('http://www.example.com/data.txt')", ">>> print(word_freq.most_common(5))", "[('the', 102), ('of', 76), ('and', 64), ('to', 52), ('in', 41)]"]}} +{"task_id": "f_534", "prompt": "import os\nimport random\n\ndef f_534(directory, n_files):\n \"\"\"\n Create n random txt files in a specific directory, write only a single digit random integer into each file, and then reset the cursor to the beginning of each file.\n\n Parameters:\n - directory (str): The directory in which to generate the files.\n - n_files (int): The number of files to generate.\n\n Returns:\n - n_files (int): The number of files generated.\n\n Requirements:\n - os\n - random\n\n Example:\n >>> random.seed(2)\n >>> f_534('/path/to/directory', 5)\n 5\n \"\"\"", "canonical_solution": " if not os.path.exists(directory):\n os.makedirs(directory)\n\n for i in range(n_files):\n filename = os.path.join(directory, f\"file_{i+1}.txt\")\n\n with open(filename, 'w') as file:\n file.write(str(random.randint(0, 9)))\n file.seek(0)\n\n 
return n_files", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def base(self, dir, n_files, contents):\n random.seed(42)\n # Create directory\n if not os.path.exists(dir):\n os.makedirs(dir)\n # Run function\n n = f_534(dir, n_files)\n # Check files\n self.assertEqual(n, n_files)\n read_data = []\n for f in sorted(os.listdir(dir)):\n self.assertTrue(f.endswith('.txt'))\n with open(os.path.join(dir, f), 'r') as file:\n read_data.append(file.read())\n file.seek(0)\n self.assertEqual(read_data, contents)\n def tearDown(self):\n shutil.rmtree('./directory', ignore_errors=True)\n shutil.rmtree('./dir', ignore_errors=True)\n shutil.rmtree('./d', ignore_errors=True)\n def test_case_1(self):\n self.base('./directory', 5, ['1', '0', '4', '3', '3'])\n def test_case_2(self):\n self.base('./dir', 10, ['1', '9', '0', '4', '3', '3', '2', '1', '8', '1'])\n def test_case_3(self):\n self.base('./d', 15, ['1', '9', '6', '0', '0', '1', '3', '0', '4', '3', '3', '2', '1', '8', '1'])\n def test_case_4(self):\n self.base('./d', 20, ['1', '9', '6', '0', '0', '1', '3', '3', '8', '9', '0', '0', '8', '4', '3', '3', '2', '1', '8', '1'])\n def test_case_5(self):\n self.base('./directory', 25, ['1', '9', '6', '0', '0', '1', '3', '3', '8', '9', '0', '0', '8', '3', '8', '6', '3', '7', '4', '3', '3', '2', '1', '8', '1'])", "apis": ["os.makedirs", "random.randint", "os.path", "os.path.join", "os.path.exists"], "libs": ["os", "random"], "doc": {"description": ["Create n random txt files in a specific directory, write only a single digit random integer into each file, and then reset the cursor to the beginning of each file."], "note": [], "params": ["directory (str): The directory in which to generate the files.", "n_files (int): The number of files to generate."], "returns": ["n_files (int): The number of files generated."], "reqs": ["os", "random"], "raises": [], "example": [">>> random.seed(2)", ">>> f_534('/path/to/directory', 5)", "5"]}} +{"task_id": "f_422", 
"prompt": "import sqlite3\nimport pandas as pd\nimport os\n\n\ndef f_422(db_name, table_name, csv_path=\"data.csv\"):\n \"\"\"\n Read SQLite3 table via pandas and export to a CSV file.\n\n Parameters:\n - db_name (str): The path to the SQLite3 database.\n - table_name (str): The name of the table to export.\n - csv_path (str, optional): The path where the CSV file will be saved. Defaults to 'data.csv'.\n\n Requirements:\n - sqlite3\n - pandas\n - os\n\n Returns:\n str: The absolute path of the exported CSV file.\n\n Example:\n >>> f_422('test.db', 'People')\n 'data.csv'\n >>> f_422('/absolute/path/to/test.db', 'Orders', 'orders.csv')\n '/absolute/path/to/orders.csv'\n \"\"\"", "canonical_solution": " try:\n conn = sqlite3.connect(db_name)\n df = pd.read_sql_query(f\"SELECT * from {table_name}\", conn)\n df.to_csv(csv_path, index=False)\n return os.path.abspath(csv_path)\n finally:\n conn.close()", "test": "import unittest\nimport os\nimport tempfile\nimport shutil\nimport sqlite3\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.temp_dir_obj = tempfile.TemporaryDirectory()\n cls.temp_dir = cls.temp_dir_obj.name\n cls.db_path = os.path.join(cls.temp_dir, \"test.db\")\n # Setup the database and tables\n conn = sqlite3.connect(cls.db_path)\n cursor = conn.cursor()\n # Create tables and insert some data\n cursor.execute(\"CREATE TABLE People (Name TEXT, Age INTEGER)\")\n cursor.execute(\n \"INSERT INTO People VALUES ('Alice', 30), ('Bob', 25), ('Charlie', 35)\"\n )\n cursor.execute(\"CREATE TABLE Orders (Product TEXT, Quantity INTEGER)\")\n cursor.execute(\n \"INSERT INTO Orders VALUES ('Widgets', 5), ('Gadgets', 10), ('Doodads', 15)\"\n )\n conn.commit()\n conn.close()\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir_obj.cleanup()\n def test_case_1(self):\n # Test exporting the People table\n csv_path = os.path.join(self.temp_dir, \"data.csv\")\n output_path = f_422(self.db_path, \"People\", csv_path)\n 
self.assertTrue(os.path.exists(output_path), \"CSV file not created.\")\n df = pd.read_csv(output_path)\n self.assertEqual(len(df), 3, \"CSV contains incorrect number of rows.\")\n self.assertTrue(\"Alice\" in df[\"Name\"].values, \"Expected data not found in CSV.\")\n def test_case_2(self):\n # Test exporting the Orders table\n csv_path = os.path.join(self.temp_dir, \"orders.csv\")\n output_path = f_422(self.db_path, \"Orders\", csv_path)\n self.assertTrue(os.path.exists(output_path), \"CSV file not created.\")\n df = pd.read_csv(output_path)\n self.assertEqual(len(df), 3, \"CSV contains incorrect number of rows.\")\n self.assertTrue(5 in df[\"Quantity\"].values, \"Expected data not found in CSV.\")\n def test_case_3(self):\n # Test exporting with a custom CSV path\n custom_path = os.path.join(self.temp_dir, \"custom_data.csv\")\n output_path = f_422(self.db_path, \"People\", custom_path)\n self.assertTrue(\n os.path.exists(output_path), \"CSV file not created at custom path.\"\n )\n self.assertEqual(\n output_path,\n os.path.abspath(custom_path),\n \"Returned path does not match expected path.\",\n )\n def test_case_4(self):\n # Test with a non-existent database\n with self.assertRaises(Exception):\n f_422(os.path.join(self.temp_dir, \"nonexistent.db\"), \"People\")\n def test_case_5(self):\n # Test with a non-existent table\n with self.assertRaises(pd.io.sql.DatabaseError):\n f_422(self.db_path, \"NonexistentTable\")\n def test_case_6(self):\n # Test if the function overwrites an existing CSV file\n csv_path = os.path.join(self.temp_dir, \"data.csv\")\n with open(csv_path, \"w\") as file:\n file.write(\"Old Content\")\n output_path = f_422(self.db_path, \"People\", csv_path)\n self.assertTrue(os.path.exists(output_path), \"CSV file not created.\")\n with open(output_path, \"r\") as file:\n content = file.read()\n self.assertNotEqual(\n \"Old Content\", content, \"Old content found in CSV. 
Overwriting failed.\"\n )\n def test_case_7(self):\n # Test error handling with invalid CSV path\n with self.assertRaises(OSError):\n f_422(self.db_path, \"People\", \"/nonexistent_path/data.csv\")", "apis": ["os.path", "os.path.abspath", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["os", "pandas", "sqlite3"], "doc": {"description": ["Read SQLite3 table via pandas and export to a CSV file."], "note": [], "params": ["db_name (str): The path to the SQLite3 database.", "table_name (str): The name of the table to export.", "csv_path (str, optional): The path where the CSV file will be saved. Defaults to 'data.csv'."], "returns": ["str: The absolute path of the exported CSV file."], "reqs": ["sqlite3", "pandas", "os"], "raises": [], "example": [">>> f_422('test.db', 'People')", "'data.csv'", ">>> f_422('/absolute/path/to/test.db', 'Orders', 'orders.csv')", "'/absolute/path/to/orders.csv'"]}} +{"task_id": "f_541", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.preprocessing import StandardScaler\n\ndef f_541(df, features):\n \"\"\"\n Standardize the functions in a DataFrame.\n The function applies standard scaling to the features.\n \n Parameters:\n - df (pandas.DataFrame): The input DataFrame.\n - features (list): The list of features to standardize. 
May be empty.\n \n Returns:\n - df (pandas.DataFrame): The DataFrame with the standardized features.\n\n Requirements:\n - pandas\n - numpy\n - scikit-learn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])\n >>> df = f_541(df, ['a', 'b'])\n >>> print(df)\n a b c\n 0 0.608932 0.127900 0.647689\n 1 2.025355 0.031682 -0.234137\n 2 2.102894 1.036701 -0.469474\n 3 0.672204 -0.198368 -0.465730\n 4 0.257348 -1.653196 -1.724918\n 5 -0.852601 -0.749663 0.314247\n 6 -1.329753 -1.150504 1.465649\n 7 -0.388180 0.334397 -1.424748\n 8 -0.827890 0.377940 -1.150994\n 9 0.441917 -0.336059 -0.291694\n 10 -0.907003 2.125260 -0.013497\n 11 -1.536337 1.092000 -1.220844\n 12 0.211669 -1.699745 -1.328186\n 13 0.195104 1.007633 0.171368\n 14 -0.236192 -0.035498 -1.478522\n 15 -1.070045 -0.195579 1.057122\n 16 0.397644 -1.502441 0.324084\n 17 -0.608039 -0.412603 0.611676\n 18 1.346302 1.201107 -0.839218\n 19 -0.503330 0.599035 0.975545\n \"\"\"", "canonical_solution": " if not features:\n return df\n\n # Initialize the StandardScaler\n scaler = StandardScaler()\n \n # Apply StandardScaler to the specified features\n # Using pd.DataFrame to explicitly reference DataFrame operations\n df.loc[:, features] = pd.DataFrame(scaler.fit_transform(df.loc[:, features]), columns=features, index=df.index)\n\n # Example of explicit np usage, even though not necessary for this function\n # Just for demonstration: add a dummy operation using np\n df['dummy'] = np.zeros(len(df))\n\n return df.drop('dummy', axis=1) ", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])\n df = f_541(df, ['a', 'b'])\n self.assertEqual(df.shape, (10, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] >= -3) and np.all(df['a'] <= 3))\n self.assertTrue(np.all(df['b'] >= 
-3) and np.all(df['b'] <= 3))\n self.assertTrue(np.all(df['c'] >= -3) and np.all(df['c'] <= 3))\n def test_case_2(self):\n df = pd.DataFrame({'a': [0, 0, 0], 'b': [0, 0, 0], 'c': [0, 0, 0]})\n df = f_541(df, ['a', 'b'])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] == 0))\n self.assertTrue(np.all(df['b'] == 0))\n self.assertTrue(np.all(df['c'] == 0))\n def test_case_3(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n df = f_541(df, ['a', 'b'])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] >= -3) and np.all(df['a'] <= 3))\n self.assertTrue(np.all(df['b'] >= -3) and np.all(df['b'] <= 3))\n self.assertTrue(np.all(df['c'] == [7, 8, 9]))\n def test_case_4(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n df = f_541(df, ['c'])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] == [1, 2, 3]))\n self.assertTrue(np.all(df['b'] == [4, 5, 6]))\n self.assertTrue(np.all(df['c'] >= -3) and np.all(df['c'] <= 3))\n def test_case_5(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n df = f_541(df, [])\n self.assertEqual(df.shape, (3, 3))\n self.assertTrue('a' in df.columns)\n self.assertTrue('b' in df.columns)\n self.assertTrue('c' in df.columns)\n self.assertTrue(np.all(df['a'] == [1, 2, 3]))\n self.assertTrue(np.all(df['b'] == [4, 5, 6]))\n self.assertTrue(np.all(df['c'] == [7, 8, 9]))", "apis": ["pandas.DataFrame", "sklearn.preprocessing.StandardScaler", "numpy.zeros"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Standardize the functions in a DataFrame.", "The function applies 
standard scaling to the features."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame.", "features (list): The list of features to standardize. May be empty."], "returns": ["df (pandas.DataFrame): The DataFrame with the standardized features."], "reqs": ["pandas", "numpy", "scikit-learn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])", ">>> df = f_541(df, ['a', 'b'])", ">>> print(df)", "a b c", "0 0.608932 0.127900 0.647689", "1 2.025355 0.031682 -0.234137", "2 2.102894 1.036701 -0.469474", "3 0.672204 -0.198368 -0.465730", "4 0.257348 -1.653196 -1.724918", "5 -0.852601 -0.749663 0.314247", "6 -1.329753 -1.150504 1.465649", "7 -0.388180 0.334397 -1.424748", "8 -0.827890 0.377940 -1.150994", "9 0.441917 -0.336059 -0.291694", "10 -0.907003 2.125260 -0.013497", "11 -1.536337 1.092000 -1.220844", "12 0.211669 -1.699745 -1.328186", "13 0.195104 1.007633 0.171368", "14 -0.236192 -0.035498 -1.478522", "15 -1.070045 -0.195579 1.057122", "16 0.397644 -1.502441 0.324084", "17 -0.608039 -0.412603 0.611676", "18 1.346302 1.201107 -0.839218", "19 -0.503330 0.599035 0.975545"]}} {"task_id": "f_530", "prompt": "import itertools\nimport math\n\ndef f_530(x):\n \"\"\"\n Find the key pair in a dictionary, x, which has the highest sum of the cosine of each of its values.\n\n Parameters:\n - x (dict): The dictionary of key-value pairs.\n\n Returns:\n - tuple: The pair of keys with the highest sum of the cosine of their values.\n\n Requirements:\n - itertools\n - math\n\n Example:\n >>> f_530({'a': 1, 'b': 2, 'c': 3})\n ('a', 'b')\n ('a', 'b')\n >>> f_530({'a': 1, 'b': 2, 'c': 3, 'd': 4})\n ('a', 'b')\n ('a', 'b')\n \"\"\"", "canonical_solution": " pairs = list(itertools.combinations(x.keys(), 2))\n max_pair = max(pairs, key=lambda pair: math.cos(x[pair[0]]) + math.cos(x[pair[1]]))\n print(max_pair)\n\n return max_pair", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def 
test_case_1(self):\n self.assertEqual(sorted(f_530({'a': 1, 'b': 2, 'c': 3})), sorted(('a', 'b')))\n \n def test_case_2(self):\n self.assertEqual(sorted(f_530({'a': 1, 'b': 2, 'c': 3, 'd': 4})), sorted(('a', 'b')))\n def test_case_3(self):\n self.assertEqual( sorted(f_530({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5})), sorted(('e', 'a')))\n def test_case_4(self):\n self.assertEqual( sorted(f_530({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6})), sorted(('f', 'a')))\n def test_case_5(self):\n self.assertEqual( sorted(f_530({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7})), sorted(('g', 'f')))", "apis": ["math.cos", "itertools.combinations"], "libs": ["itertools", "math"], "doc": {"description": ["Find the key pair in a dictionary, x, which has the highest sum of the cosine of each of its values."], "note": [], "params": ["x (dict): The dictionary of key-value pairs."], "returns": ["tuple: The pair of keys with the highest sum of the cosine of their values."], "reqs": ["itertools", "math"], "raises": [], "example": [">>> f_530({'a': 1, 'b': 2, 'c': 3})", "('a', 'b')", "('a', 'b')", ">>> f_530({'a': 1, 'b': 2, 'c': 3, 'd': 4})", "('a', 'b')", "('a', 'b')"]}} -{"task_id": "f_334", "prompt": "import pandas as pd\nfrom sklearn.linear_model import LinearRegression\nimport matplotlib.pyplot as plt\n\n\ndef f_334(df1, df2, features=[\"feature1\", \"feature2\", \"feature3\"], target=\"target\"):\n \"\"\"\n Perform linear regression analysis with specified characteristics and targets.\n The function should merge two dataframes based on the 'id' column, perform\n linear regression using columns specified in features to predict the target,\n and plot the residuals.\n\n Parameters:\n - df1 (DataFrame): The first dataframe containing columns 'id' and the features specified.\n - df2 (DataFrame): The second dataframe containing columns 'id' and target.\n - features (list of str, optional): List of feature column names. 
Default is ['feature1', 'feature2', 'feature3'].\n - target (str, optional): Name of the target column. Default is 'target'.\n\n Returns:\n dict: A dictionary containing:\n - 'coefficients': Regression coefficients (list).\n - 'intercept': Regression intercept (float).\n - 'residuals_plot': A matplotlib Axes object representing the residuals plot.\n\n Requirements:\n - pandas\n - sklearn.linear_model.LinearRegression\n - matplotlib.pyplot\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})\n >>> result = f_334(df1, df2)\n >>> result['coefficients']\n [0.3333333333333334, 0.33333333333333354, 0.3333333333333335]\n >>> type(result['residuals_plot'])\n \n \"\"\"", "canonical_solution": " df = pd.merge(df1, df2, on=\"id\")\n X = df[features]\n y = df[target]\n model = LinearRegression()\n model.fit(X, y)\n y_pred = model.predict(X)\n residuals = y - y_pred\n fig, ax = plt.subplots()\n ax.scatter(y_pred, residuals) # scatter plot of residuals\n ax.axhline(y=0, color=\"r\", linestyle=\"-\") # horizontal line at y=0\n ax.set_xlabel(\"Predicted Values\")\n ax.set_ylabel(\"Residuals\")\n ax.set_title(\"Residuals Plot\")\n return {\n \"coefficients\": list(model.coef_),\n \"intercept\": model.intercept_,\n \"residuals_plot\": ax,\n }", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\nclass TestCases(unittest.TestCase):\n # Setting up sample data for some test cases\n def setUp(self):\n self.df1_sample = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [1, 2, 3],\n \"feature3\": [1, 2, 3],\n }\n )\n self.df2_sample = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [6, 15, 24]})\n def tearDown(self):\n plt.close(\"all\")\n # Test if the function returns the correct coefficients and intercept\n def 
test_case_1(self):\n result = f_334(self.df1_sample, self.df2_sample)\n for coef_actual, coef_expected in zip(result[\"coefficients\"], [3.0, 3.0, 3.0]):\n self.assertAlmostEqual(coef_actual, coef_expected, places=7)\n self.assertAlmostEqual(result[\"intercept\"], -3.0, places=7)\n # Test if the function returns the residuals plot\n def test_case_2(self):\n result = f_334(self.df1_sample, self.df2_sample)\n self.assertTrue(isinstance(result[\"residuals_plot\"], plt.Axes))\n # Test if the residuals plot contains the right number of data points\n def test_case_3(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [2, 4, 6],\n \"feature2\": [2, 4, 6],\n \"feature3\": [2, 4, 6],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [12, 30, 48]})\n result = f_334(df1, df2)\n self.assertEqual(len(result[\"residuals_plot\"].collections), 1)\n # Test if the intercept of the model is correct\n def test_case_4(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [4, 5, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [10, 11, 12]})\n result = f_334(df1, df2)\n self.assertAlmostEqual(result[\"intercept\"], 6.0, places=7)\n # Test the coefficients and intercept for a different set of data\n def test_case_5(self):\n result = f_334(self.df1_sample, self.df2_sample)\n for coef_actual, coef_expected in zip(result[\"coefficients\"], [3.0, 3.0, 3.0]):\n self.assertAlmostEqual(coef_actual, coef_expected, places=7)\n self.assertAlmostEqual(result[\"intercept\"], -3.0, places=7)\n # Test the coefficients and intercept against sklearn's LinearRegression for verification\n def test_case_6(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n \"feature1\": list(range(10)),\n \"feature2\": list(range(10, 20)),\n \"feature3\": list(range(20, 30)),\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], \"target\": list(range(30, 40))}\n 
)\n result = f_334(df1, df2)\n model = LinearRegression().fit(\n df1[[\"feature1\", \"feature2\", \"feature3\"]], df2[\"target\"]\n )\n expected_coefficients = model.coef_\n expected_intercept = model.intercept_\n self.assertListEqual(result[\"coefficients\"], list(expected_coefficients))\n self.assertEqual(result[\"intercept\"], expected_intercept)\n # Test the residuals plot's title and grid properties\n def test_case_7(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [4, 5, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [10, 11, 12]})\n result = f_334(df1, df2)\n self.assertEqual(result[\"residuals_plot\"].get_title(), \"Residuals Plot\")\n self.assertTrue(result[\"residuals_plot\"].grid)\n self.assertEqual(len(result[\"residuals_plot\"].lines), 1)", "apis": ["matplotlib.pyplot.subplots", "sklearn.linear_model.LinearRegression", "pandas.merge"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["Perform linear regression analysis with specified characteristics and targets.", "The function should merge two dataframes based on the 'id' column, perform", "linear regression using columns specified in features to predict the target,", "and plot the residuals."], "note": [], "params": ["df1 (DataFrame): The first dataframe containing columns 'id' and the features specified.", "df2 (DataFrame): The second dataframe containing columns 'id' and target.", "features (list of str, optional): List of feature column names. Default is ['feature1', 'feature2', 'feature3'].", "target (str, optional): Name of the target column. 
Default is 'target'."], "returns": ["dict: A dictionary containing:", "'coefficients': Regression coefficients (list).", "'intercept': Regression intercept (float).", "'residuals_plot': A matplotlib Axes object representing the residuals plot."], "reqs": ["pandas", "sklearn.linear_model.LinearRegression", "matplotlib.pyplot"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})", ">>> result = f_334(df1, df2)", ">>> result['coefficients']", "[0.3333333333333334, 0.33333333333333354, 0.3333333333333335]", ">>> type(result['residuals_plot'])", ""]}} -{"task_id": "f_752", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_752(letters, repetitions, colors):\n \"\"\"\n Create a bar chart to visualize the frequency of each letter in a flattened list \n formed by multiple repetitions of the original list. 
Each repetition of the list \n is associated with a different color in the chart.\n \n Note:\n - Generate a bar chart for the frequency of letters, where each letter's frequency\n is determined by its number of repetitions.\n - Each letter's bar in the chart is colored according to the specified color.\n - The length of the list `colors` should match the number of repetitions of `letters`.\n - The lists 'letters' and 'colors' cannot be empty.\n \n Input:\n - letters (list of str): A list of unique letters to be visualized.\n - repetitions (list of int): A list of the number of times each letter is repeated.\n Must be the same length as `letters`.\n - colors (list of str): A list of colors for the bars corresponding to each letter.\n Must be the same length as `letters`.\n \n Output:\n - Returns the Matplotlib Axes object representing the created bar chart.\n \n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue'])\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if len(letters) != len(repetitions) or len(letters) != len(colors) or len(letters) == 0:\n raise ValueError(\"All lists must be the same length and non-empty.\")\n \n # Count the frequency of each letter based on repetitions\n counts = np.array(repetitions)\n \n # Create the bar chart\n fig, ax = plt.subplots()\n ax.bar(letters, counts, color=colors)\n ax.set_xlabel('Letters')\n ax.set_ylabel('Frequency')\n ax.set_title('Frequency of Letters')\n \n return ax", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def test_basic_input(self):\n ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue'])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Frequency of Letters\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n expected_colors = ['red', 'green', 'blue']\n for patch, expected_color in zip(ax.patches, expected_colors):\n 
self.assertEqual(patch.get_facecolor(), plt.cm.colors.to_rgba(expected_color))\n expected_counts = [3, 5, 2]\n for patch, expected_count in zip(ax.patches, expected_counts):\n self.assertEqual(patch.get_height(), expected_count)\n \n def test_invalid_input_length(self):\n with self.assertRaises(ValueError):\n f_752(['A', 'B'], [3], ['red', 'green'])\n \n def test_empty_lists(self):\n with self.assertRaises(ValueError):\n f_752([], [], [])\n \n def test_single_letter(self):\n ax = f_752(['Z'], [1], ['purple'])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Frequency of Letters\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n self.assertEqual(ax.patches[0].get_facecolor(), plt.cm.colors.to_rgba('purple'))\n self.assertEqual(ax.patches[0].get_height(), 1)\n \n def test_multiple_repetitions(self):\n ax = f_752(['D', 'E', 'F'], [10, 20, 15], ['cyan', 'magenta', 'yellow'])\n self.assertIsInstance(ax, plt.Axes)\n expected_counts = [10, 20, 15]\n for patch, expected_count in zip(ax.patches, expected_counts):\n self.assertEqual(patch.get_height(), expected_count)", "apis": ["numpy.array", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Create a bar chart to visualize the frequency of each letter in a flattened list", "formed by multiple repetitions of the original list. 
Each repetition of the list", "is associated with a different color in the chart.", "Input:", "- letters (list of str): A list of unique letters to be visualized.", "- repetitions (list of int): A list of the number of times each letter is repeated.", "Must be the same length as `letters`.", "- colors (list of str): A list of colors for the bars corresponding to each letter.", "Must be the same length as `letters`.", "Output:", "- Returns the Matplotlib Axes object representing the created bar chart."], "note": ["Generate a bar chart for the frequency of letters, where each letter's frequency", "is determined by its number of repetitions.", "Each letter's bar in the chart is colored according to the specified color.", "The length of the list `colors` should match the number of repetitions of `letters`.", "The lists 'letters' and 'colors' cannot be empty."], "params": [], "returns": [], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue'])", ">>> type(ax)", ""]}} -{"task_id": "f_838", "prompt": "import os\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n\ndef f_838(file_path: str, plot_path: str) -> (float, float, str):\n \"\"\"\n Processes a CSV file at the given path by reading its contents, cleaning the data,\n performing statistical analysis, and generating a plot, which is saved to the specified path.\n\n Sets the title of the plot to \"Data Visualization\".\n Labels the x-axis as \"Index\" and the y-axis as \"Value\".\n Saves the generated plot to the file path specified in 'plot_path'.\n\n Parameters:\n - file_path (str): Path to the CSV input file.\n - plot_path (str): Path where the plot will be saved.\n\n Returns:\n - tuple: A tuple containing the following elements:\n - Mean (float): The average value of the data. Returns NaN if data is empty or non-numeric.\n - Median (float): The middle value of the data when sorted. 
Returns NaN if data is empty or non-numeric.\n - Plot Path (str): The path where the plot is saved.\n\n Raises:\n - FileNotFoundError: If the CSV file at 'file_path' does not exist.\n\n Requirements:\n - os\n - pandas\n - matplotlib\n - numpy\n\n Example:\n >>> f_838(\"sample_data.csv\", \"output_plot.png\")\n (25.5, 23.0, \"output_plot.png\")\n \"\"\"", "canonical_solution": " # Check if file exists\n if not os.path.isfile(file_path):\n raise FileNotFoundError(f\"File {file_path} does not exist.\")\n\n # Load data and handle empty file\n try:\n data = pd.read_csv(file_path)\n except pd.errors.EmptyDataError:\n return np.nan, np.nan, plot_path\n\n # Convert data to numeric, coerce errors to NaN\n data = pd.to_numeric(data.squeeze(), errors=\"coerce\")\n\n # Ensure data is a Pandas Series\n if not isinstance(data, pd.Series):\n data = pd.Series(data)\n\n # Clean data\n data = data.dropna()\n\n # Perform analysis\n if data.empty:\n mean = median = np.nan\n else:\n # Calculate mean and median\n mean = float(np.mean(data))\n median = float(np.median(data))\n\n # Create plot and save it\n plt.figure(figsize=(10, 6))\n plt.plot(data)\n plt.title(\"Data Visualization\")\n plt.xlabel(\"Index\")\n plt.ylabel(\"Value\")\n plt.savefig(plot_path)\n plt.close()\n\n return mean, median, plot_path", "test": "import unittest\nimport os\nimport numpy as np\nimport pandas as pd\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_838 function.\"\"\"\n def setUp(self):\n # Create a directory for test files if it doesn't exist\n self.test_dir = \"mnt/data/f_838_data_test\"\n os.makedirs(self.test_dir, exist_ok=True)\n # Create a valid data file\n self.valid_data_path = os.path.join(self.test_dir, \"valid_data.csv\")\n pd.DataFrame({\"data\": np.random.rand(100)}).to_csv(\n self.valid_data_path, index=False\n )\n # Create an empty data file\n self.empty_data_path = os.path.join(self.test_dir, \"empty_data.csv\")\n with open(self.empty_data_path, \"w\") as 
f:\n f.write(\"\")\n # Create a non-numeric data file\n self.non_numeric_data_path = os.path.join(self.test_dir, \"non_numeric_data.csv\")\n pd.DataFrame({\"data\": [\"a\", \"b\", \"c\", \"d\"]}).to_csv(\n self.non_numeric_data_path, index=False\n )\n # Create a large data file\n self.large_data_path = os.path.join(self.test_dir, \"large_data.csv\")\n pd.DataFrame({\"data\": np.random.rand(10000)}).to_csv(\n self.large_data_path, index=False\n )\n # Create a data file with NaN values\n self.nan_data_path = os.path.join(self.test_dir, \"nan_data.csv\")\n pd.DataFrame({\"data\": [1, np.nan, 2, np.nan, 3]}).to_csv(\n self.nan_data_path, index=False\n )\n # Create a data file with a single value\n self.single_value_path = os.path.join(self.test_dir, \"single_value.csv\")\n pd.DataFrame({\"data\": [42]}).to_csv(self.single_value_path, index=False)\n # Create a data file where all values are NaN\n self.all_nan_path = os.path.join(self.test_dir, \"all_nan.csv\")\n pd.DataFrame({\"data\": [np.nan, np.nan, np.nan]}).to_csv(\n self.all_nan_path, index=False\n )\n def test_valid_input(self):\n \"\"\"Test that the function runs without errors and returns the correct output.\"\"\"\n plot_path = os.path.join(self.test_dir, \"valid_plot.png\")\n mean, median, plot_path = f_838(self.valid_data_path, plot_path)\n self.assertIsInstance(mean, float)\n self.assertIsInstance(median, float)\n self.assertTrue(os.path.exists(plot_path))\n def test_file_not_found(self):\n \"\"\"Test that the function raises a FileNotFoundError when the specified file does not exist.\"\"\"\n plot_path = os.path.join(self.test_dir, \"not_found_plot.png\")\n with self.assertRaises(FileNotFoundError):\n f_838(os.path.join(self.test_dir, \"non_existent_file.csv\"), plot_path)\n def test_empty_file(self):\n \"\"\"Test that the function returns NaN for mean and median when the file is empty.\"\"\"\n plot_path = os.path.join(self.test_dir, \"empty_plot.png\")\n mean, median, returned_plot_path = 
f_838(self.empty_data_path, plot_path)\n self.assertTrue(np.isnan(mean))\n self.assertTrue(np.isnan(median))\n self.assertFalse(\n os.path.exists(returned_plot_path)\n ) # Plot should not exist for empty file\n def test_non_numeric_data(self):\n \"\"\"Test that the function returns NaN for mean and median when the file contains non-numeric data.\"\"\"\n plot_path = os.path.join(self.test_dir, \"non_numeric_plot.png\")\n mean, median, returned_plot_path = f_838(self.non_numeric_data_path, plot_path)\n self.assertTrue(np.isnan(mean))\n self.assertTrue(np.isnan(median))\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_large_data(self):\n \"\"\"Test that the function runs without errors and returns the correct output for a large data file.\"\"\"\n plot_path = os.path.join(self.test_dir, \"large_data_plot.png\")\n mean, median, returned_plot_path = f_838(self.large_data_path, plot_path)\n self.assertIsInstance(mean, float)\n self.assertIsInstance(median, float)\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_data_with_nan_values(self):\n \"\"\"Test that the function returns the correct output for a data file with NaN values.\"\"\"\n plot_path = os.path.join(self.test_dir, \"nan_data_plot.png\")\n mean, median, returned_plot_path = f_838(self.nan_data_path, plot_path)\n self.assertNotEqual(mean, np.nan)\n self.assertNotEqual(median, np.nan)\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_single_value_data(self):\n \"\"\"Test that the function returns the correct output for a data file with a single value.\"\"\"\n plot_path = os.path.join(self.test_dir, \"single_value_plot.png\")\n mean, median, returned_plot_path = f_838(self.single_value_path, plot_path)\n self.assertEqual(mean, 42)\n self.assertEqual(median, 42)\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_all_nan_data(self):\n \"\"\"Test that the function returns NaN for mean and median when the file contains all NaN values.\"\"\"\n plot_path = 
os.path.join(self.test_dir, \"all_nan_plot.png\")\n mean, median, returned_plot_path = f_838(self.all_nan_path, plot_path)\n self.assertTrue(np.isnan(mean))\n self.assertTrue(np.isnan(median))\n self.assertTrue(os.path.exists(returned_plot_path))\n def tearDown(self):\n # Remove all created files\n plt.clf()\n for filename in os.listdir(self.test_dir):\n file_path = os.path.join(self.test_dir, filename)\n if os.path.isfile(file_path) or os.path.islink(file_path):\n os.remove(file_path)\n # Remove the test directory\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["pandas.read_csv", "numpy.median", "numpy.mean", "matplotlib.pyplot.xlabel", "matplotlib.pyplot.plot", "pandas.Series", "numpy.nan", "os.path", "matplotlib.pyplot.title", "matplotlib.pyplot.close", "matplotlib.pyplot.savefig", "pandas.errors", "matplotlib.pyplot.figure", "matplotlib.pyplot.ylabel", "pandas.to_numeric", "os.path.isfile"], "libs": ["numpy", "pandas", "os", "matplotlib"], "doc": {"description": ["Processes a CSV file at the given path by reading its contents, cleaning the data,", "performing statistical analysis, and generating a plot, which is saved to the specified path.", "Sets the title of the plot to \"Data Visualization\".", "Labels the x-axis as \"Index\" and the y-axis as \"Value\".", "Saves the generated plot to the file path specified in 'plot_path'."], "note": [], "params": ["file_path (str): Path to the CSV input file.", "plot_path (str): Path where the plot will be saved."], "returns": ["tuple: A tuple containing the following elements:", "Mean (float): The average value of the data. Returns NaN if data is empty or non-numeric.", "Median (float): The middle value of the data when sorted. 
Returns NaN if data is empty or non-numeric.", "Plot Path (str): The path where the plot is saved."], "reqs": ["os", "pandas", "matplotlib", "numpy"], "raises": ["FileNotFoundError: If the CSV file at 'file_path' does not exist."], "example": [">>> f_838(\"sample_data.csv\", \"output_plot.png\")", "(25.5, 23.0, \"output_plot.png\")"]}} -{"task_id": "f_366", "prompt": "import matplotlib.pyplot as plt\nimport numpy as np\n\n\ndef f_366(n, seed=0):\n \"\"\"\n Generates a simple scatter plot with 'n' points.\n\n Parameters:\n - n (int): The number of points to be plotted.\n - seed (int, optional): The seed for the random number generator. Defaults to None.\n\n Returns:\n - plot (matplotlib.figure.Figure): The generated plot titled \"Scatter plot of random points\".\n - points (list of tuples): List containing the (x, y) coordinates of the plotted points.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> f_366(5)\n (
, [(0.5488135039273248, 0.6458941130666561), (0.7151893663724195, 0.4375872112626925), (0.6027633760716439, 0.8917730007820798), (0.5448831829968969, 0.9636627605010293), (0.4236547993389047, 0.3834415188257777)])\n \"\"\"", "canonical_solution": " # Setting the random seed for reproducibility\n np.random.seed(seed)\n\n # Generating random points\n x = np.random.rand(n)\n y = np.random.rand(n)\n\n # Plotting\n fig, ax = plt.subplots()\n ax.scatter(x, y)\n ax.set_title(\"Scatter plot of random points\")\n ax.set_xlabel(\"X\")\n ax.set_ylabel(\"Y\")\n\n return fig, list(zip(x, y))", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic point type and structure\n _, points = f_366(5)\n self.assertTrue(\n all(\n isinstance(point, tuple)\n and len(point) == 2\n and all(isinstance(coord, float) for coord in point)\n for point in points\n ),\n \"Points should be a list of tuples with float coordinates\",\n )\n def test_case_2(self):\n # Test parameter 'n'\n for n in [0, 1, 5, 100]:\n plot, points = f_366(n)\n self.assertEqual(len(points), n)\n self.assertTrue(isinstance(plot, type(plt.figure())))\n def test_case_3(self):\n # Test random seed - reproduction\n _, points1 = f_366(5, seed=1)\n _, points2 = f_366(5, seed=1)\n self.assertEqual(\n points1, points2, \"Points generated with the same seed should match exactly\"\n )\n def test_case_4(self):\n # Test random seed - differences\n _, points1 = f_366(5, seed=1)\n _, points2 = f_366(5, seed=10)\n self.assertNotEqual(\n points1, points2, \"Points generated with the same seed should match exactly\"\n )\n def test_case_5(self):\n # Test invalid inputs\n with self.assertRaises(ValueError):\n f_366(-5)\n with self.assertRaises(TypeError):\n f_366(5.5)\n with self.assertRaises(TypeError):\n f_366(\"5\")\n def test_case_6(self):\n # Test visualization\n fig, _ = f_366(1)\n ax = fig.axes[0]\n self.assertEqual(ax.get_title(), \"Scatter plot of random 
points\")\n self.assertEqual(ax.get_xlabel(), \"X\")\n self.assertEqual(ax.get_ylabel(), \"Y\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random", "numpy.random.rand", "numpy.random.seed", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Generates a simple scatter plot with 'n' points."], "note": [], "params": ["n (int): The number of points to be plotted.", "seed (int, optional): The seed for the random number generator. Defaults to None."], "returns": ["plot (matplotlib.figure.Figure): The generated plot titled \"Scatter plot of random points\".", "points (list of tuples): List containing the (x, y) coordinates of the plotted points."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> f_366(5)", "(
, [(0.5488135039273248, 0.6458941130666561), (0.7151893663724195, 0.4375872112626925), (0.6027633760716439, 0.8917730007820798), (0.5448831829968969, 0.9636627605010293), (0.4236547993389047, 0.3834415188257777)])"]}} -{"task_id": "f_899", "prompt": "import pandas as pd\nimport itertools\nimport random\n\n\ndef f_899(colors, states):\n \"\"\"\n Generates a pandas DataFrame containing shuffled combinations of provided colors and states.\n The DataFrame is formatted so that each column represents a series of unique combinations,\n with each combination displayed as \"Color:State\".\n\n Parameters:\n - colors (list): A list of strings representing color names.\n - states (list): A list of strings representing state descriptions.\n\n Returns:\n - df (pandas.DataFrame): A DataFrame where each cell contains a string of the format \"Color:State\".\n The combinations are distributed across columns, with the number of columns being the lesser\n of the lengths of 'colors' and 'states'.\n\n Requirements:\n - pandas\n - itertools\n - random\n\n Note:\n - Cartesian product of 'colors' and 'states',\n - The number of columns in the resulting DataFrame is determined by the smaller number of elements\n in either the 'colors' or 'states' list, ensuring an even distribution without excess empty cells.\n - If the number of combinations is not evenly divisible by the number of columns, some columns\n will have fewer entries.\n\n Example:\n >>> colors = ['Red', 'Blue', 'Green']\n >>> states = ['Solid', 'Liquid']\n >>> color_state_table = f_899(colors, states)\n >>> print(color_state_table)\n Color:State 1 Color:State 2\n 0 Blue:Liquid Red:Liquid\n 1 Blue:Solid Green:Solid\n 2 Red:Solid Green:Liquid\n \"\"\"", "canonical_solution": " combinations = list(itertools.product(colors, states))\n random.seed(42)\n random.shuffle(combinations)\n num_columns = min(len(colors), len(states))\n\n data = {\n f\"Color:State {i+1}\": [\n f\"{comb[0]}:{comb[1]}\" for comb in 
combinations[i::num_columns]\n ]\n for i in range(num_columns)\n }\n df = pd.DataFrame(data)\n\n return df", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_899.\"\"\"\n def test_empty_lists(self):\n \"\"\"Test with empty color and state lists.\"\"\"\n self.assertEqual(f_899([], []).empty, True)\n def test_single_color_and_state(self):\n \"\"\"Test with one color and one state.\"\"\"\n random.seed(0)\n result = f_899([\"Red\"], [\"Solid\"])\n expected = pd.DataFrame({\"Color:State 1\": [\"Red:Solid\"]})\n pd.testing.assert_frame_equal(result, expected)\n def test_multiple_colors_single_state(self):\n \"\"\"Test with multiple colors and a single state.\"\"\"\n random.seed(1)\n result = f_899([\"Red\", \"Blue\", \"Green\"], [\"Solid\"])\n expected_combinations = set([\"Red:Solid\", \"Blue:Solid\", \"Green:Solid\"])\n result_combinations = set(result[\"Color:State 1\"])\n self.assertEqual(result_combinations, expected_combinations)\n def test_single_color_multiple_states(self):\n \"\"\"Test with a single color and multiple states.\"\"\"\n random.seed(2)\n result = f_899([\"Red\"], [\"Solid\", \"Liquid\", \"Gas\"])\n expected_combinations = set([\"Red:Solid\", \"Red:Liquid\", \"Red:Gas\"])\n result_combinations = set(result[\"Color:State 1\"])\n self.assertEqual(result_combinations, expected_combinations)\n def test_multiple_colors_and_states(self):\n \"\"\"Test with multiple colors and states.\"\"\"\n random.seed(3)\n colors = [\"Red\", \"Blue\"]\n states = [\"Solid\", \"Liquid\"]\n result = f_899(colors, states)\n expected_combinations = set(\n [f\"{color}:{state}\" for color in colors for state in states]\n )\n result_combinations = set(result.values.flatten())\n self.assertEqual(result_combinations, expected_combinations)", "apis": ["random.shuffle", "pandas.DataFrame", "random.seed", "itertools.product"], "libs": ["pandas", "random", "itertools"], "doc": {"description": ["Generates a pandas 
DataFrame containing shuffled combinations of provided colors and states.", "The DataFrame is formatted so that each column represents a series of unique combinations,", "with each combination displayed as \"Color:State\"."], "note": ["Cartesian product of 'colors' and 'states',", "The number of columns in the resulting DataFrame is determined by the smaller number of elements", "in either the 'colors' or 'states' list, ensuring an even distribution without excess empty cells.", "If the number of combinations is not evenly divisible by the number of columns, some columns", "will have fewer entries."], "params": ["colors (list): A list of strings representing color names.", "states (list): A list of strings representing state descriptions."], "returns": ["df (pandas.DataFrame): A DataFrame where each cell contains a string of the format \"Color:State\".", "The combinations are distributed across columns, with the number of columns being the lesser", "of the lengths of 'colors' and 'states'."], "reqs": ["pandas", "itertools", "random"], "raises": [], "example": [">>> colors = ['Red', 'Blue', 'Green']", ">>> states = ['Solid', 'Liquid']", ">>> color_state_table = f_899(colors, states)", ">>> print(color_state_table)", "Color:State 1 Color:State 2", "0 Blue:Liquid Red:Liquid", "1 Blue:Solid Green:Solid", "2 Red:Solid Green:Liquid"]}} -{"task_id": "f_416", "prompt": "import csv\nfrom collections import Counter\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_416(file_path):\n \"\"\"\n Identifies duplicate rows from a CSV file using the csv library, convert duplicated rows\n into a pandas DataFrame, then plot using matplotlib.\n\n Parameters:\n - file_path (str): The path to the CSV file.\n\n Returns:\n - dict: A dictionary with duplicate rows as keys and their counts as values.\n - Axes: A matplotlib Axes object with the bar chart of duplicate rows.\n\n Requirements:\n - csv\n - collections.Counter\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> 
duplicates, ax = f_416(\"sample_data.csv\")\n >>> duplicates\n {('Alice', '25', 'New York'): 3, ('Bob', '30', 'London'): 2}\n >>> type(ax)\n \n\n Note: Ensure the CSV file is in proper format and has a .csv extension. Other file formats will raise a ValueError.\n \"\"\"", "canonical_solution": " # Strip the file_path and then check its extension\n file_path = file_path.strip()\n if not file_path.lower().endswith(\".csv\"):\n raise ValueError(\"Invalid file format. Only .csv files are accepted.\")\n\n # Read the CSV file\n with open(file_path, \"r\") as f:\n reader = csv.reader(f)\n rows = list(reader)\n\n # Use Counter to get duplicates\n duplicates = Counter(tuple(row) for row in rows if rows.count(row) > 1)\n\n # Plot the duplicates using matplotlib\n ax = None\n if duplicates:\n df = pd.DataFrame(duplicates.values(), duplicates.keys())\n ax = df.plot(kind=\"bar\", legend=False, title=\"Duplicate Entries\")\n ax.set_ylabel(\"Count\")\n plt.tight_layout()\n\n return duplicates, ax", "test": "import unittest\nimport tempfile\nimport os\nimport matplotlib\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.addCleanup(self.temp_dir.cleanup)\n def tearDown(self):\n plt.close(\"all\")\n def create_temp_csv_file(self, content):\n # Create a temporary CSV file within the temp directory\n temp_file_path = os.path.join(self.temp_dir.name, \"temp_file.csv\")\n with open(temp_file_path, \"w\", newline=\"\") as temp_file:\n temp_file.write(content)\n return temp_file_path\n def test_case_1(self):\n # With duplicates - test results\n content = \"Name,Age,City\\nAlice,25,New York\\nAlice,25,New York\\nBob,30,London\\nAlice,25,New York\\nBob,30,London\"\n file_path = self.create_temp_csv_file(content)\n duplicates, _ = f_416(file_path)\n self.assertEqual(\n duplicates,\n Counter({(\"Alice\", \"25\", \"New York\"): 3, (\"Bob\", \"30\", \"London\"): 2}),\n )\n def test_case_2(self):\n # 
With duplicates - test plot\n content = \"Name,Age,City\\nAlice,25,New York\\nAlice,25,New York\\nBob,30,London\\nAlice,25,New York\\nBob,30,London\"\n file_path = self.create_temp_csv_file(content)\n _, ax = f_416(file_path)\n # Test plot\n self.assertIsNotNone(ax)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertEqual(ax.get_title(), \"Duplicate Entries\")\n self.assertEqual(ax.get_ylabel(), \"Count\")\n def test_case_3(self):\n # Without duplicates\n content = \"Name,Age,City\\nEve,28,Paris\\nAdam,32,Berlin\"\n file_path = self.create_temp_csv_file(content)\n duplicates, ax = f_416(file_path)\n self.assertEqual(duplicates, Counter())\n self.assertIsNone(ax)\n def test_case_4(self):\n with self.assertRaises(ValueError):\n f_416(\"sample_data.txt\")\n def test_case_5(self):\n with self.assertRaises(FileNotFoundError):\n f_416(os.path.join(self.temp_dir.name, \"non_existent_file.csv\"))", "apis": ["matplotlib.pyplot.tight_layout", "pandas.DataFrame", "collections.Counter", "csv.reader"], "libs": ["matplotlib", "pandas", "csv", "collections"], "doc": {"description": ["Identifies duplicate rows from a CSV file using the csv library, convert duplicated rows", "into a pandas DataFrame, then plot using matplotlib."], "note": ["Ensure the CSV file is in proper format and has a .csv extension. 
Other file formats will raise a ValueError."], "params": ["file_path (str): The path to the CSV file."], "returns": ["dict: A dictionary with duplicate rows as keys and their counts as values.", "Axes: A matplotlib Axes object with the bar chart of duplicate rows."], "reqs": ["csv", "collections.Counter", "pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> duplicates, ax = f_416(\"sample_data.csv\")", ">>> duplicates", "{('Alice', '25', 'New York'): 3, ('Bob', '30', 'London'): 2}", ">>> type(ax)", ""]}} -{"task_id": "f_372", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport itertools\n\ndef f_372(n_walks, n_steps, seed=None):\n \"\"\"\n Create and plot `n_walks` number of random walks, each with `n_steps` steps.\n\n The function checks for valid n_walks and n_steps, then generates walks via numpy.\n Each walk is plotted in a different color cycling through a predefined set of colors:\n ['b', 'g', 'r', 'c', 'm', 'y', 'k'].\n\n Parameters:\n - n_walks (int): The number of random walks to be generated and plotted.\n - n_steps (int): The number of steps in each random walk.\n - seed (int, optional): Seed for random number generation. 
Default is None.\n\n Returns:\n - ax (plt.Axes): A Matplotlib Axes containing the plotted random walks.\n\n Requirements:\n - numpy\n - matplotlib\n - itertools\n\n Example:\n >>> ax = f_372(5, 100, seed=42)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-20.0, 0, '\u221220'), Text(0.0, 0, '0'), Text(20.0, 0, '20'), Text(40.0, 0, '40'), Text(60.0, 0, '60'), Text(80.0, 0, '80'), Text(100.0, 0, '100'), Text(120.0, 0, '120')]\n \"\"\"", "canonical_solution": " if n_walks < 0 or n_steps < 0:\n raise ValueError(\"Walks and steps cannot be negative.\")\n np.random.seed(seed)\n COLORS = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]\n color_cycle = itertools.cycle(COLORS)\n fig, ax = plt.subplots()\n for _ in range(n_walks):\n walk = np.random.choice([-1, 1], size=n_steps)\n walk = np.cumsum(walk)\n ax.plot(walk, next(color_cycle))\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic setup\n ax = f_372(5, 100, seed=42)\n self.assertIsInstance(ax, plt.Axes)\n def test_case_2(self):\n # Test number of walks\n for n_walk in [0, 1, 2, 10, 50]:\n ax = f_372(n_walk, 10, seed=42)\n lines = ax.get_lines()\n self.assertEqual(len(lines), n_walk)\n def test_case_3(self):\n # Test number of steps\n for n_steps in [0, 1, 10, 100, 500]:\n ax = f_372(2, n_steps, seed=42)\n lines = ax.get_lines()\n self.assertEqual(len(lines[0].get_ydata()), n_steps)\n def test_case_4(self):\n # Test random seed\n ax1 = f_372(5, 100, seed=42)\n ax2 = f_372(5, 100, seed=42)\n ax3 = f_372(5, 100, seed=0)\n lines1 = ax1.get_lines()\n lines2 = ax2.get_lines()\n lines3 = ax3.get_lines()\n self.assertTrue(\n all(\n np.array_equal(line1.get_ydata(), line2.get_ydata())\n for line1, line2 in zip(lines1, lines2)\n )\n )\n self.assertFalse(\n all(\n np.array_equal(line1.get_ydata(), line3.get_ydata())\n for line1, line3 in zip(lines1, lines3)\n ),\n \"Random walks are not reproducible using 
the same seed.\",\n )\n def test_case_5(self):\n # Test invalid n_walks\n with self.assertRaises(ValueError):\n f_372(-1, 100, seed=42)\n def test_case_6(self):\n # Test negative n_steps\n with self.assertRaises(ValueError):\n f_372(1, -100, seed=42)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.cumsum", "numpy.random", "itertools.cycle", "numpy.random.seed", "numpy.random.choice", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "itertools"], "doc": {"description": ["Create and plot `n_walks` number of random walks, each with `n_steps` steps.", "The function checks for valid n_walks and n_steps, then generates walks via numpy.", "Each walk is plotted in a different color cycling through a predefined set of colors:", "['b', 'g', 'r', 'c', 'm', 'y', 'k']."], "note": [], "params": ["n_walks (int): The number of random walks to be generated and plotted.", "n_steps (int): The number of steps in each random walk.", "seed (int, optional): Seed for random number generation. Default is None."], "returns": ["ax (plt.Axes): A Matplotlib Axes containing the plotted random walks."], "reqs": ["numpy", "matplotlib", "itertools"], "raises": [], "example": [">>> ax = f_372(5, 100, seed=42)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-20.0, 0, '\u221220'), Text(0.0, 0, '0'), Text(20.0, 0, '20'), Text(40.0, 0, '40'), Text(60.0, 0, '60'), Text(80.0, 0, '80'), Text(100.0, 0, '100'), Text(120.0, 0, '120')]"]}} -{"task_id": "f_392", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_392(days, random_seed=0):\n \"\"\"\n Generates a spending report DataFrame for the given number of days.\n\n This function takes a number of days as input and populates a pandas DataFrame\n with fake expenditure data indexed by date. Each day on or after '2023-01-01'\n has its own row. 
The DataFrame has five columns: Groceries, Entertainment, Rent,\n Utilities, and Miscellaneous, with their integer values independently randomly\n sampled from 0 to 100.\n\n Parameters:\n - days (int): Number of days for which the report is to be generated.\n This is used to generate dates starting from '2023-01-01'.\n For example, a 'days' of 2 will generate data for '2023-01-01',\n '2023-01-02'.\n If 0, this function will return a DataFrame with the expected\n columns that is otherwise empty.\n - random_seed (int): Numpy random seed for reproducibility. Defaults to 0.\n\n Returns:\n - pd.DataFrame: A DataFrame containing spending details for specified days,\n with shape (num_days, 5).\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> df = f_392(5, random_seed=42)\n >>> type(df)\n \n >>> df.head(2)\n Groceries Entertainment Rent Utilities Miscellaneous\n date \n 2023-01-01 51 20 87 52 1\n 2023-01-02 92 82 99 1 63\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n date_rng = pd.date_range(start=\"2023-01-01\", periods=days, freq=\"D\")\n df = pd.DataFrame(date_rng, columns=[\"date\"])\n df.set_index(\"date\", inplace=True)\n categories = [\"Groceries\", \"Entertainment\", \"Rent\", \"Utilities\", \"Miscellaneous\"]\n for category in categories:\n df[category] = np.random.randint(0, 100, size=(days))\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n report_columns = [\n \"Groceries\",\n \"Entertainment\",\n \"Rent\",\n \"Utilities\",\n \"Miscellaneous\",\n ]\n start_date = pd.to_datetime([\"2023-01-01\"]).day\n def _test_report_structure(self, report, days):\n self.assertIsInstance(report, pd.DataFrame)\n self.assertEqual(report.shape[0], days)\n self.assertEqual(report.shape[1], len(self.report_columns))\n self.assertEqual(list(report.columns), self.report_columns)\n def _test_report_data(self, report):\n self.assertFalse(report.isnull().values.any())\n 
self.assertTrue(pd.api.types.is_datetime64_ns_dtype(report.index))\n self.assertTrue(report.index.day.map(lambda d: d >= self.start_date).all())\n for col in report:\n self.assertTrue((report[col] >= 0).all() and (report[col] <= 100).all())\n def _test_report(self, report, days):\n self._test_report_structure(report, days)\n self._test_report_data(report)\n def test_case_1(self):\n # Test basic case with default parameters\n days = 7\n report = f_392(days)\n self._test_report(report, days)\n def test_case_2(self):\n # Test handling 0 days\n days = 0\n report = f_392(days)\n self._test_report(report, days)\n def test_case_3(self):\n # Test handling larger number of days\n days = 1000\n report = f_392(days)\n self._test_report(report, days)\n def test_case_4(self):\n # Test handling invalid inputs\n with self.assertRaises(ValueError):\n f_392(-1)\n with self.assertRaises(ValueError):\n f_392(None)\n with self.assertRaises(TypeError):\n f_392(\"-1\")\n def test_case_5(self):\n # Test random seed reproducibility\n days = 100\n report1 = f_392(days, random_seed=42)\n report2 = f_392(days, random_seed=42)\n self.assertTrue(report1.equals(report2))\n self._test_report(report1, days)\n self._test_report(report2, days)\n def test_case_6(self):\n # Test random seed variation\n days = 100\n report1 = f_392(days, random_seed=24)\n report2 = f_392(days, random_seed=42)\n self.assertFalse(report1.equals(report2))\n self._test_report(report1, days)\n self._test_report(report2, days)", "apis": ["numpy.random.randint", "numpy.random", "numpy.random.seed", "pandas.date_range", "pandas.DataFrame"], "libs": ["numpy", "pandas"], "doc": {"description": ["Generates a spending report DataFrame for the given number of days.", "This function takes a number of days as input and populates a pandas DataFrame", "with fake expenditure data indexed by date. Each day on or after '2023-01-01'", "has its own row. 
The DataFrame has five columns: Groceries, Entertainment, Rent,", "Utilities, and Miscellaneous, with their integer values independently randomly", "sampled from 0 to 100."], "note": [], "params": ["days (int): Number of days for which the report is to be generated.", "This is used to generate dates starting from '2023-01-01'.", "For example, a 'days' of 2 will generate data for '2023-01-01',", "'2023-01-02'.", "If 0, this function will return a DataFrame with the expected", "columns that is otherwise empty.", "random_seed (int): Numpy random seed for reproducibility. Defaults to 0."], "returns": ["pd.DataFrame: A DataFrame containing spending details for specified days,", "with shape (num_days, 5)."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> df = f_392(5, random_seed=42)", ">>> type(df)", "", ">>> df.head(2)", "Groceries Entertainment Rent Utilities Miscellaneous", "date", "2023-01-01 51 20 87 52 1", "2023-01-02 92 82 99 1 63"]}} +{"task_id": "f_334", "prompt": "import pandas as pd\nfrom sklearn.linear_model import LinearRegression\nimport matplotlib.pyplot as plt\n\n\ndef f_334(df1, df2, features=[\"feature1\", \"feature2\", \"feature3\"], target=\"target\"):\n \"\"\"\n Perform linear regression analysis with specified characteristics and targets.\n The function should merge two dataframes based on the 'id' column, perform\n linear regression using columns specified in features to predict the target,\n and plot the residuals.\n\n Parameters:\n - df1 (DataFrame): The first dataframe containing columns 'id' and the features specified.\n - df2 (DataFrame): The second dataframe containing columns 'id' and target.\n - features (list of str, optional): List of feature column names. Default is ['feature1', 'feature2', 'feature3'].\n - target (str, optional): Name of the target column. 
Default is 'target'.\n\n Returns:\n dict: A dictionary containing:\n - 'coefficients': Regression coefficients (list).\n - 'intercept': Regression intercept (float).\n - 'residuals_plot': A matplotlib Axes object representing the residuals plot.\n\n Requirements:\n - pandas\n - sklearn.linear_model.LinearRegression\n - matplotlib.pyplot\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})\n >>> result = f_334(df1, df2)\n >>> result['coefficients']\n [0.3333333333333334, 0.33333333333333354, 0.3333333333333335]\n >>> type(result['residuals_plot'])\n \n \"\"\"", "canonical_solution": " df = pd.merge(df1, df2, on=\"id\")\n X = df[features]\n y = df[target]\n model = LinearRegression()\n model.fit(X, y)\n y_pred = model.predict(X)\n residuals = y - y_pred\n fig, ax = plt.subplots()\n ax.scatter(y_pred, residuals) # scatter plot of residuals\n ax.axhline(y=0, color=\"r\", linestyle=\"-\") # horizontal line at y=0\n ax.set_xlabel(\"Predicted Values\")\n ax.set_ylabel(\"Residuals\")\n ax.set_title(\"Residuals Plot\")\n return {\n \"coefficients\": list(model.coef_),\n \"intercept\": model.intercept_,\n \"residuals_plot\": ax,\n }", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\nclass TestCases(unittest.TestCase):\n # Setting up sample data for some test cases\n def setUp(self):\n self.df1_sample = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [1, 2, 3],\n \"feature3\": [1, 2, 3],\n }\n )\n self.df2_sample = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [6, 15, 24]})\n def tearDown(self):\n plt.close(\"all\")\n # Test if the function returns the correct coefficients and intercept\n def test_case_1(self):\n result = f_334(self.df1_sample, self.df2_sample)\n for coef_actual, coef_expected in 
zip(result[\"coefficients\"], [3.0, 3.0, 3.0]):\n self.assertAlmostEqual(coef_actual, coef_expected, places=7)\n self.assertAlmostEqual(result[\"intercept\"], -3.0, places=7)\n # Test if the function returns the residuals plot\n def test_case_2(self):\n result = f_334(self.df1_sample, self.df2_sample)\n self.assertTrue(isinstance(result[\"residuals_plot\"], plt.Axes))\n # Test if the residuals plot contains the right number of data points\n def test_case_3(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [2, 4, 6],\n \"feature2\": [2, 4, 6],\n \"feature3\": [2, 4, 6],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [12, 30, 48]})\n result = f_334(df1, df2)\n self.assertEqual(len(result[\"residuals_plot\"].collections), 1)\n # Test if the intercept of the model is correct\n def test_case_4(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [4, 5, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [10, 11, 12]})\n result = f_334(df1, df2)\n self.assertAlmostEqual(result[\"intercept\"], 6.0, places=7)\n # Test the coefficients and intercept for a different set of data\n def test_case_5(self):\n result = f_334(self.df1_sample, self.df2_sample)\n for coef_actual, coef_expected in zip(result[\"coefficients\"], [3.0, 3.0, 3.0]):\n self.assertAlmostEqual(coef_actual, coef_expected, places=7)\n self.assertAlmostEqual(result[\"intercept\"], -3.0, places=7)\n # Test the coefficients and intercept against sklearn's LinearRegression for verification\n def test_case_6(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n \"feature1\": list(range(10)),\n \"feature2\": list(range(10, 20)),\n \"feature3\": list(range(20, 30)),\n }\n )\n df2 = pd.DataFrame(\n {\"id\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], \"target\": list(range(30, 40))}\n )\n result = f_334(df1, df2)\n model = LinearRegression().fit(\n df1[[\"feature1\", \"feature2\", 
\"feature3\"]], df2[\"target\"]\n )\n expected_coefficients = model.coef_\n expected_intercept = model.intercept_\n self.assertListEqual(result[\"coefficients\"], list(expected_coefficients))\n self.assertEqual(result[\"intercept\"], expected_intercept)\n # Test the residuals plot's title and grid properties\n def test_case_7(self):\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1, 2, 3],\n \"feature2\": [4, 5, 6],\n \"feature3\": [7, 8, 9],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [10, 11, 12]})\n result = f_334(df1, df2)\n self.assertEqual(result[\"residuals_plot\"].get_title(), \"Residuals Plot\")\n self.assertTrue(result[\"residuals_plot\"].grid)\n self.assertEqual(len(result[\"residuals_plot\"].lines), 1)", "apis": ["matplotlib.pyplot.subplots", "pandas.merge", "sklearn.linear_model.LinearRegression"], "libs": ["sklearn", "pandas", "matplotlib"], "doc": {"description": ["Perform linear regression analysis with specified characteristics and targets.", "The function should merge two dataframes based on the 'id' column, perform", "linear regression using columns specified in features to predict the target,", "and plot the residuals."], "note": [], "params": ["df1 (DataFrame): The first dataframe containing columns 'id' and the features specified.", "df2 (DataFrame): The second dataframe containing columns 'id' and target.", "features (list of str, optional): List of feature column names. Default is ['feature1', 'feature2', 'feature3'].", "target (str, optional): Name of the target column. 
Default is 'target'."], "returns": ["dict: A dictionary containing:", "'coefficients': Regression coefficients (list).", "'intercept': Regression intercept (float).", "'residuals_plot': A matplotlib Axes object representing the residuals plot."], "reqs": ["pandas", "sklearn.linear_model.LinearRegression", "matplotlib.pyplot"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})", ">>> result = f_334(df1, df2)", ">>> result['coefficients']", "[0.3333333333333334, 0.33333333333333354, 0.3333333333333335]", ">>> type(result['residuals_plot'])", ""]}} +{"task_id": "f_752", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_752(letters, repetitions, colors):\n \"\"\"\n Create a bar chart to visualize the frequency of each letter in a flattened list \n formed by multiple repetitions of the original list. 
Each repetition of the list \n is associated with a different color in the chart.\n \n Note:\n - Generate a bar chart for the frequency of letters, where each letter's frequency\n is determined by its number of repetitions.\n - Each letter's bar in the chart is colored according to the specified color.\n - The length of the list `colors` should match the number of repetitions of `letters`.\n - The lists 'letters' and 'colors' cannot be empty.\n \n Input:\n - letters (list of str): A list of unique letters to be visualized.\n - repetitions (list of int): A list of the number of times each letter is repeated.\n Must be the same length as `letters`.\n - colors (list of str): A list of colors for the bars corresponding to each letter.\n Must be the same length as `letters`.\n \n Output:\n - Returns the Matplotlib Axes object representing the created bar chart.\n \n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue'])\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if len(letters) != len(repetitions) or len(letters) != len(colors) or len(letters) == 0:\n raise ValueError(\"All lists must be the same length and non-empty.\")\n \n # Count the frequency of each letter based on repetitions\n counts = np.array(repetitions)\n \n # Create the bar chart\n fig, ax = plt.subplots()\n ax.bar(letters, counts, color=colors)\n ax.set_xlabel('Letters')\n ax.set_ylabel('Frequency')\n ax.set_title('Frequency of Letters')\n \n return ax", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def test_basic_input(self):\n ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue'])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Frequency of Letters\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n expected_colors = ['red', 'green', 'blue']\n for patch, expected_color in zip(ax.patches, expected_colors):\n 
self.assertEqual(patch.get_facecolor(), plt.cm.colors.to_rgba(expected_color))\n expected_counts = [3, 5, 2]\n for patch, expected_count in zip(ax.patches, expected_counts):\n self.assertEqual(patch.get_height(), expected_count)\n \n def test_invalid_input_length(self):\n with self.assertRaises(ValueError):\n f_752(['A', 'B'], [3], ['red', 'green'])\n \n def test_empty_lists(self):\n with self.assertRaises(ValueError):\n f_752([], [], [])\n \n def test_single_letter(self):\n ax = f_752(['Z'], [1], ['purple'])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Frequency of Letters\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n self.assertEqual(ax.patches[0].get_facecolor(), plt.cm.colors.to_rgba('purple'))\n self.assertEqual(ax.patches[0].get_height(), 1)\n \n def test_multiple_repetitions(self):\n ax = f_752(['D', 'E', 'F'], [10, 20, 15], ['cyan', 'magenta', 'yellow'])\n self.assertIsInstance(ax, plt.Axes)\n expected_counts = [10, 20, 15]\n for patch, expected_count in zip(ax.patches, expected_counts):\n self.assertEqual(patch.get_height(), expected_count)", "apis": ["matplotlib.pyplot.subplots", "numpy.array"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Create a bar chart to visualize the frequency of each letter in a flattened list", "formed by multiple repetitions of the original list. 
Each repetition of the list", "is associated with a different color in the chart.", "Input:", "- letters (list of str): A list of unique letters to be visualized.", "- repetitions (list of int): A list of the number of times each letter is repeated.", "Must be the same length as `letters`.", "- colors (list of str): A list of colors for the bars corresponding to each letter.", "Must be the same length as `letters`.", "Output:", "- Returns the Matplotlib Axes object representing the created bar chart."], "note": ["Generate a bar chart for the frequency of letters, where each letter's frequency", "is determined by its number of repetitions.", "Each letter's bar in the chart is colored according to the specified color.", "The length of the list `colors` should match the number of repetitions of `letters`.", "The lists 'letters' and 'colors' cannot be empty."], "params": [], "returns": [], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue'])", ">>> type(ax)", ""]}} +{"task_id": "f_838", "prompt": "import os\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n\ndef f_838(file_path: str, plot_path: str) -> (float, float, str):\n \"\"\"\n Processes a CSV file at the given path by reading its contents, cleaning the data,\n performing statistical analysis, and generating a plot, which is saved to the specified path.\n\n Sets the title of the plot to \"Data Visualization\".\n Labels the x-axis as \"Index\" and the y-axis as \"Value\".\n Saves the generated plot to the file path specified in 'plot_path'.\n\n Parameters:\n - file_path (str): Path to the CSV input file.\n - plot_path (str): Path where the plot will be saved.\n\n Returns:\n - tuple: A tuple containing the following elements:\n - Mean (float): The average value of the data. Returns NaN if data is empty or non-numeric.\n - Median (float): The middle value of the data when sorted. 
Returns NaN if data is empty or non-numeric.\n - Plot Path (str): The path where the plot is saved.\n\n Raises:\n - FileNotFoundError: If the CSV file at 'file_path' does not exist.\n\n Requirements:\n - os\n - pandas\n - matplotlib\n - numpy\n\n Example:\n >>> f_838(\"sample_data.csv\", \"output_plot.png\")\n (25.5, 23.0, \"output_plot.png\")\n \"\"\"", "canonical_solution": " # Check if file exists\n if not os.path.isfile(file_path):\n raise FileNotFoundError(f\"File {file_path} does not exist.\")\n\n # Load data and handle empty file\n try:\n data = pd.read_csv(file_path)\n except pd.errors.EmptyDataError:\n return np.nan, np.nan, plot_path\n\n # Convert data to numeric, coerce errors to NaN\n data = pd.to_numeric(data.squeeze(), errors=\"coerce\")\n\n # Ensure data is a Pandas Series\n if not isinstance(data, pd.Series):\n data = pd.Series(data)\n\n # Clean data\n data = data.dropna()\n\n # Perform analysis\n if data.empty:\n mean = median = np.nan\n else:\n # Calculate mean and median\n mean = float(np.mean(data))\n median = float(np.median(data))\n\n # Create plot and save it\n plt.figure(figsize=(10, 6))\n plt.plot(data)\n plt.title(\"Data Visualization\")\n plt.xlabel(\"Index\")\n plt.ylabel(\"Value\")\n plt.savefig(plot_path)\n plt.close()\n\n return mean, median, plot_path", "test": "import unittest\nimport os\nimport numpy as np\nimport pandas as pd\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_838 function.\"\"\"\n def setUp(self):\n # Create a directory for test files if it doesn't exist\n self.test_dir = \"mnt/data/f_838_data_test\"\n os.makedirs(self.test_dir, exist_ok=True)\n # Create a valid data file\n self.valid_data_path = os.path.join(self.test_dir, \"valid_data.csv\")\n pd.DataFrame({\"data\": np.random.rand(100)}).to_csv(\n self.valid_data_path, index=False\n )\n # Create an empty data file\n self.empty_data_path = os.path.join(self.test_dir, \"empty_data.csv\")\n with open(self.empty_data_path, \"w\") as 
f:\n f.write(\"\")\n # Create a non-numeric data file\n self.non_numeric_data_path = os.path.join(self.test_dir, \"non_numeric_data.csv\")\n pd.DataFrame({\"data\": [\"a\", \"b\", \"c\", \"d\"]}).to_csv(\n self.non_numeric_data_path, index=False\n )\n # Create a large data file\n self.large_data_path = os.path.join(self.test_dir, \"large_data.csv\")\n pd.DataFrame({\"data\": np.random.rand(10000)}).to_csv(\n self.large_data_path, index=False\n )\n # Create a data file with NaN values\n self.nan_data_path = os.path.join(self.test_dir, \"nan_data.csv\")\n pd.DataFrame({\"data\": [1, np.nan, 2, np.nan, 3]}).to_csv(\n self.nan_data_path, index=False\n )\n # Create a data file with a single value\n self.single_value_path = os.path.join(self.test_dir, \"single_value.csv\")\n pd.DataFrame({\"data\": [42]}).to_csv(self.single_value_path, index=False)\n # Create a data file where all values are NaN\n self.all_nan_path = os.path.join(self.test_dir, \"all_nan.csv\")\n pd.DataFrame({\"data\": [np.nan, np.nan, np.nan]}).to_csv(\n self.all_nan_path, index=False\n )\n def test_valid_input(self):\n \"\"\"Test that the function runs without errors and returns the correct output.\"\"\"\n plot_path = os.path.join(self.test_dir, \"valid_plot.png\")\n mean, median, plot_path = f_838(self.valid_data_path, plot_path)\n self.assertIsInstance(mean, float)\n self.assertIsInstance(median, float)\n self.assertTrue(os.path.exists(plot_path))\n def test_file_not_found(self):\n \"\"\"Test that the function raises a FileNotFoundError when the specified file does not exist.\"\"\"\n plot_path = os.path.join(self.test_dir, \"not_found_plot.png\")\n with self.assertRaises(FileNotFoundError):\n f_838(os.path.join(self.test_dir, \"non_existent_file.csv\"), plot_path)\n def test_empty_file(self):\n \"\"\"Test that the function returns NaN for mean and median when the file is empty.\"\"\"\n plot_path = os.path.join(self.test_dir, \"empty_plot.png\")\n mean, median, returned_plot_path = 
f_838(self.empty_data_path, plot_path)\n self.assertTrue(np.isnan(mean))\n self.assertTrue(np.isnan(median))\n self.assertFalse(\n os.path.exists(returned_plot_path)\n ) # Plot should not exist for empty file\n def test_non_numeric_data(self):\n \"\"\"Test that the function returns NaN for mean and median when the file contains non-numeric data.\"\"\"\n plot_path = os.path.join(self.test_dir, \"non_numeric_plot.png\")\n mean, median, returned_plot_path = f_838(self.non_numeric_data_path, plot_path)\n self.assertTrue(np.isnan(mean))\n self.assertTrue(np.isnan(median))\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_large_data(self):\n \"\"\"Test that the function runs without errors and returns the correct output for a large data file.\"\"\"\n plot_path = os.path.join(self.test_dir, \"large_data_plot.png\")\n mean, median, returned_plot_path = f_838(self.large_data_path, plot_path)\n self.assertIsInstance(mean, float)\n self.assertIsInstance(median, float)\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_data_with_nan_values(self):\n \"\"\"Test that the function returns the correct output for a data file with NaN values.\"\"\"\n plot_path = os.path.join(self.test_dir, \"nan_data_plot.png\")\n mean, median, returned_plot_path = f_838(self.nan_data_path, plot_path)\n self.assertNotEqual(mean, np.nan)\n self.assertNotEqual(median, np.nan)\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_single_value_data(self):\n \"\"\"Test that the function returns the correct output for a data file with a single value.\"\"\"\n plot_path = os.path.join(self.test_dir, \"single_value_plot.png\")\n mean, median, returned_plot_path = f_838(self.single_value_path, plot_path)\n self.assertEqual(mean, 42)\n self.assertEqual(median, 42)\n self.assertTrue(os.path.exists(returned_plot_path))\n def test_all_nan_data(self):\n \"\"\"Test that the function returns NaN for mean and median when the file contains all NaN values.\"\"\"\n plot_path = 
os.path.join(self.test_dir, \"all_nan_plot.png\")\n mean, median, returned_plot_path = f_838(self.all_nan_path, plot_path)\n self.assertTrue(np.isnan(mean))\n self.assertTrue(np.isnan(median))\n self.assertTrue(os.path.exists(returned_plot_path))\n def tearDown(self):\n # Remove all created files\n plt.clf()\n for filename in os.listdir(self.test_dir):\n file_path = os.path.join(self.test_dir, filename)\n if os.path.isfile(file_path) or os.path.islink(file_path):\n os.remove(file_path)\n # Remove the test directory\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["pandas.read_csv", "matplotlib.pyplot.figure", "pandas.Series", "matplotlib.pyplot.savefig", "numpy.mean", "os.path", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.close", "matplotlib.pyplot.title", "numpy.nan", "matplotlib.pyplot.plot", "pandas.errors", "numpy.median", "os.path.isfile", "pandas.to_numeric", "matplotlib.pyplot.xlabel"], "libs": ["os", "numpy", "pandas", "matplotlib"], "doc": {"description": ["Processes a CSV file at the given path by reading its contents, cleaning the data,", "performing statistical analysis, and generating a plot, which is saved to the specified path.", "Sets the title of the plot to \"Data Visualization\".", "Labels the x-axis as \"Index\" and the y-axis as \"Value\".", "Saves the generated plot to the file path specified in 'plot_path'."], "note": [], "params": ["file_path (str): Path to the CSV input file.", "plot_path (str): Path where the plot will be saved."], "returns": ["tuple: A tuple containing the following elements:", "Mean (float): The average value of the data. Returns NaN if data is empty or non-numeric.", "Median (float): The middle value of the data when sorted. 
Returns NaN if data is empty or non-numeric.", "Plot Path (str): The path where the plot is saved."], "reqs": ["os", "pandas", "matplotlib", "numpy"], "raises": ["FileNotFoundError: If the CSV file at 'file_path' does not exist."], "example": [">>> f_838(\"sample_data.csv\", \"output_plot.png\")", "(25.5, 23.0, \"output_plot.png\")"]}} +{"task_id": "f_366", "prompt": "import matplotlib.pyplot as plt\nimport numpy as np\n\n\ndef f_366(n, seed=0):\n \"\"\"\n Generates a simple scatter plot with 'n' points.\n\n Parameters:\n - n (int): The number of points to be plotted.\n - seed (int, optional): The seed for the random number generator. Defaults to None.\n\n Returns:\n - plot (matplotlib.figure.Figure): The generated plot titled \"Scatter plot of random points\".\n - points (list of tuples): List containing the (x, y) coordinates of the plotted points.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> f_366(5)\n (
, [(0.5488135039273248, 0.6458941130666561), (0.7151893663724195, 0.4375872112626925), (0.6027633760716439, 0.8917730007820798), (0.5448831829968969, 0.9636627605010293), (0.4236547993389047, 0.3834415188257777)])\n \"\"\"", "canonical_solution": " # Setting the random seed for reproducibility\n np.random.seed(seed)\n\n # Generating random points\n x = np.random.rand(n)\n y = np.random.rand(n)\n\n # Plotting\n fig, ax = plt.subplots()\n ax.scatter(x, y)\n ax.set_title(\"Scatter plot of random points\")\n ax.set_xlabel(\"X\")\n ax.set_ylabel(\"Y\")\n\n return fig, list(zip(x, y))", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic point type and structure\n _, points = f_366(5)\n self.assertTrue(\n all(\n isinstance(point, tuple)\n and len(point) == 2\n and all(isinstance(coord, float) for coord in point)\n for point in points\n ),\n \"Points should be a list of tuples with float coordinates\",\n )\n def test_case_2(self):\n # Test parameter 'n'\n for n in [0, 1, 5, 100]:\n plot, points = f_366(n)\n self.assertEqual(len(points), n)\n self.assertTrue(isinstance(plot, type(plt.figure())))\n def test_case_3(self):\n # Test random seed - reproduction\n _, points1 = f_366(5, seed=1)\n _, points2 = f_366(5, seed=1)\n self.assertEqual(\n points1, points2, \"Points generated with the same seed should match exactly\"\n )\n def test_case_4(self):\n # Test random seed - differences\n _, points1 = f_366(5, seed=1)\n _, points2 = f_366(5, seed=10)\n self.assertNotEqual(\n points1, points2, \"Points generated with the same seed should match exactly\"\n )\n def test_case_5(self):\n # Test invalid inputs\n with self.assertRaises(ValueError):\n f_366(-5)\n with self.assertRaises(TypeError):\n f_366(5.5)\n with self.assertRaises(TypeError):\n f_366(\"5\")\n def test_case_6(self):\n # Test visualization\n fig, _ = f_366(1)\n ax = fig.axes[0]\n self.assertEqual(ax.get_title(), \"Scatter plot of random 
points\")\n self.assertEqual(ax.get_xlabel(), \"X\")\n self.assertEqual(ax.get_ylabel(), \"Y\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "numpy.random", "numpy.random.seed", "numpy.random.rand"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Generates a simple scatter plot with 'n' points."], "note": [], "params": ["n (int): The number of points to be plotted.", "seed (int, optional): The seed for the random number generator. Defaults to None."], "returns": ["plot (matplotlib.figure.Figure): The generated plot titled \"Scatter plot of random points\".", "points (list of tuples): List containing the (x, y) coordinates of the plotted points."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> f_366(5)", "(
, [(0.5488135039273248, 0.6458941130666561), (0.7151893663724195, 0.4375872112626925), (0.6027633760716439, 0.8917730007820798), (0.5448831829968969, 0.9636627605010293), (0.4236547993389047, 0.3834415188257777)])"]}} +{"task_id": "f_899", "prompt": "import pandas as pd\nimport itertools\nimport random\n\n\ndef f_899(colors, states):\n \"\"\"\n Generates a pandas DataFrame containing shuffled combinations of provided colors and states.\n The DataFrame is formatted so that each column represents a series of unique combinations,\n with each combination displayed as \"Color:State\".\n\n Parameters:\n - colors (list): A list of strings representing color names.\n - states (list): A list of strings representing state descriptions.\n\n Returns:\n - df (pandas.DataFrame): A DataFrame where each cell contains a string of the format \"Color:State\".\n The combinations are distributed across columns, with the number of columns being the lesser\n of the lengths of 'colors' and 'states'.\n\n Requirements:\n - pandas\n - itertools\n - random\n\n Note:\n - Cartesian product of 'colors' and 'states',\n - The number of columns in the resulting DataFrame is determined by the smaller number of elements\n in either the 'colors' or 'states' list, ensuring an even distribution without excess empty cells.\n - If the number of combinations is not evenly divisible by the number of columns, some columns\n will have fewer entries.\n\n Example:\n >>> colors = ['Red', 'Blue', 'Green']\n >>> states = ['Solid', 'Liquid']\n >>> color_state_table = f_899(colors, states)\n >>> print(color_state_table)\n Color:State 1 Color:State 2\n 0 Blue:Liquid Red:Liquid\n 1 Blue:Solid Green:Solid\n 2 Red:Solid Green:Liquid\n \"\"\"", "canonical_solution": " combinations = list(itertools.product(colors, states))\n random.seed(42)\n random.shuffle(combinations)\n num_columns = min(len(colors), len(states))\n\n data = {\n f\"Color:State {i+1}\": [\n f\"{comb[0]}:{comb[1]}\" for comb in 
combinations[i::num_columns]\n ]\n for i in range(num_columns)\n }\n df = pd.DataFrame(data)\n\n return df", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_899.\"\"\"\n def test_empty_lists(self):\n \"\"\"Test with empty color and state lists.\"\"\"\n self.assertEqual(f_899([], []).empty, True)\n def test_single_color_and_state(self):\n \"\"\"Test with one color and one state.\"\"\"\n random.seed(0)\n result = f_899([\"Red\"], [\"Solid\"])\n expected = pd.DataFrame({\"Color:State 1\": [\"Red:Solid\"]})\n pd.testing.assert_frame_equal(result, expected)\n def test_multiple_colors_single_state(self):\n \"\"\"Test with multiple colors and a single state.\"\"\"\n random.seed(1)\n result = f_899([\"Red\", \"Blue\", \"Green\"], [\"Solid\"])\n expected_combinations = set([\"Red:Solid\", \"Blue:Solid\", \"Green:Solid\"])\n result_combinations = set(result[\"Color:State 1\"])\n self.assertEqual(result_combinations, expected_combinations)\n def test_single_color_multiple_states(self):\n \"\"\"Test with a single color and multiple states.\"\"\"\n random.seed(2)\n result = f_899([\"Red\"], [\"Solid\", \"Liquid\", \"Gas\"])\n expected_combinations = set([\"Red:Solid\", \"Red:Liquid\", \"Red:Gas\"])\n result_combinations = set(result[\"Color:State 1\"])\n self.assertEqual(result_combinations, expected_combinations)\n def test_multiple_colors_and_states(self):\n \"\"\"Test with multiple colors and states.\"\"\"\n random.seed(3)\n colors = [\"Red\", \"Blue\"]\n states = [\"Solid\", \"Liquid\"]\n result = f_899(colors, states)\n expected_combinations = set(\n [f\"{color}:{state}\" for color in colors for state in states]\n )\n result_combinations = set(result.values.flatten())\n self.assertEqual(result_combinations, expected_combinations)", "apis": ["pandas.DataFrame", "itertools.product", "random.seed", "random.shuffle"], "libs": ["random", "itertools", "pandas"], "doc": {"description": ["Generates a pandas 
DataFrame containing shuffled combinations of provided colors and states.", "The DataFrame is formatted so that each column represents a series of unique combinations,", "with each combination displayed as \"Color:State\"."], "note": ["Cartesian product of 'colors' and 'states',", "The number of columns in the resulting DataFrame is determined by the smaller number of elements", "in either the 'colors' or 'states' list, ensuring an even distribution without excess empty cells.", "If the number of combinations is not evenly divisible by the number of columns, some columns", "will have fewer entries."], "params": ["colors (list): A list of strings representing color names.", "states (list): A list of strings representing state descriptions."], "returns": ["df (pandas.DataFrame): A DataFrame where each cell contains a string of the format \"Color:State\".", "The combinations are distributed across columns, with the number of columns being the lesser", "of the lengths of 'colors' and 'states'."], "reqs": ["pandas", "itertools", "random"], "raises": [], "example": [">>> colors = ['Red', 'Blue', 'Green']", ">>> states = ['Solid', 'Liquid']", ">>> color_state_table = f_899(colors, states)", ">>> print(color_state_table)", "Color:State 1 Color:State 2", "0 Blue:Liquid Red:Liquid", "1 Blue:Solid Green:Solid", "2 Red:Solid Green:Liquid"]}} +{"task_id": "f_416", "prompt": "import csv\nfrom collections import Counter\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_416(file_path):\n \"\"\"\n Identifies duplicate rows from a CSV file using the csv library, convert duplicated rows\n into a pandas DataFrame, then plot using matplotlib.\n\n Parameters:\n - file_path (str): The path to the CSV file.\n\n Returns:\n - dict: A dictionary with duplicate rows as keys and their counts as values.\n - Axes: A matplotlib Axes object with the bar chart of duplicate rows.\n\n Requirements:\n - csv\n - collections.Counter\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> 
duplicates, ax = f_416(\"sample_data.csv\")\n >>> duplicates\n {('Alice', '25', 'New York'): 3, ('Bob', '30', 'London'): 2}\n >>> type(ax)\n \n\n Note: Ensure the CSV file is in proper format and has a .csv extension. Other file formats will raise a ValueError.\n \"\"\"", "canonical_solution": " # Strip the file_path and then check its extension\n file_path = file_path.strip()\n if not file_path.lower().endswith(\".csv\"):\n raise ValueError(\"Invalid file format. Only .csv files are accepted.\")\n\n # Read the CSV file\n with open(file_path, \"r\") as f:\n reader = csv.reader(f)\n rows = list(reader)\n\n # Use Counter to get duplicates\n duplicates = Counter(tuple(row) for row in rows if rows.count(row) > 1)\n\n # Plot the duplicates using matplotlib\n ax = None\n if duplicates:\n df = pd.DataFrame(duplicates.values(), duplicates.keys())\n ax = df.plot(kind=\"bar\", legend=False, title=\"Duplicate Entries\")\n ax.set_ylabel(\"Count\")\n plt.tight_layout()\n\n return duplicates, ax", "test": "import unittest\nimport tempfile\nimport os\nimport matplotlib\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.addCleanup(self.temp_dir.cleanup)\n def tearDown(self):\n plt.close(\"all\")\n def create_temp_csv_file(self, content):\n # Create a temporary CSV file within the temp directory\n temp_file_path = os.path.join(self.temp_dir.name, \"temp_file.csv\")\n with open(temp_file_path, \"w\", newline=\"\") as temp_file:\n temp_file.write(content)\n return temp_file_path\n def test_case_1(self):\n # With duplicates - test results\n content = \"Name,Age,City\\nAlice,25,New York\\nAlice,25,New York\\nBob,30,London\\nAlice,25,New York\\nBob,30,London\"\n file_path = self.create_temp_csv_file(content)\n duplicates, _ = f_416(file_path)\n self.assertEqual(\n duplicates,\n Counter({(\"Alice\", \"25\", \"New York\"): 3, (\"Bob\", \"30\", \"London\"): 2}),\n )\n def test_case_2(self):\n # 
With duplicates - test plot\n content = \"Name,Age,City\\nAlice,25,New York\\nAlice,25,New York\\nBob,30,London\\nAlice,25,New York\\nBob,30,London\"\n file_path = self.create_temp_csv_file(content)\n _, ax = f_416(file_path)\n # Test plot\n self.assertIsNotNone(ax)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertEqual(ax.get_title(), \"Duplicate Entries\")\n self.assertEqual(ax.get_ylabel(), \"Count\")\n def test_case_3(self):\n # Without duplicates\n content = \"Name,Age,City\\nEve,28,Paris\\nAdam,32,Berlin\"\n file_path = self.create_temp_csv_file(content)\n duplicates, ax = f_416(file_path)\n self.assertEqual(duplicates, Counter())\n self.assertIsNone(ax)\n def test_case_4(self):\n with self.assertRaises(ValueError):\n f_416(\"sample_data.txt\")\n def test_case_5(self):\n with self.assertRaises(FileNotFoundError):\n f_416(os.path.join(self.temp_dir.name, \"non_existent_file.csv\"))", "apis": ["pandas.DataFrame", "matplotlib.pyplot.tight_layout", "csv.reader", "collections.Counter"], "libs": ["collections", "csv", "pandas", "matplotlib"], "doc": {"description": ["Identifies duplicate rows from a CSV file using the csv library, convert duplicated rows", "into a pandas DataFrame, then plot using matplotlib."], "note": ["Ensure the CSV file is in proper format and has a .csv extension. 
Other file formats will raise a ValueError."], "params": ["file_path (str): The path to the CSV file."], "returns": ["dict: A dictionary with duplicate rows as keys and their counts as values.", "Axes: A matplotlib Axes object with the bar chart of duplicate rows."], "reqs": ["csv", "collections.Counter", "pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> duplicates, ax = f_416(\"sample_data.csv\")", ">>> duplicates", "{('Alice', '25', 'New York'): 3, ('Bob', '30', 'London'): 2}", ">>> type(ax)", ""]}} +{"task_id": "f_372", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport itertools\n\ndef f_372(n_walks, n_steps, seed=None):\n \"\"\"\n Create and plot `n_walks` number of random walks, each with `n_steps` steps.\n\n The function checks for valid n_walks and n_steps, then generates walks via numpy.\n Each walk is plotted in a different color cycling through a predefined set of colors:\n ['b', 'g', 'r', 'c', 'm', 'y', 'k'].\n\n Parameters:\n - n_walks (int): The number of random walks to be generated and plotted.\n - n_steps (int): The number of steps in each random walk.\n - seed (int, optional): Seed for random number generation. 
Default is None.\n\n Returns:\n - ax (plt.Axes): A Matplotlib Axes containing the plotted random walks.\n\n Requirements:\n - numpy\n - matplotlib\n - itertools\n\n Example:\n >>> ax = f_372(5, 100, seed=42)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-20.0, 0, '\u221220'), Text(0.0, 0, '0'), Text(20.0, 0, '20'), Text(40.0, 0, '40'), Text(60.0, 0, '60'), Text(80.0, 0, '80'), Text(100.0, 0, '100'), Text(120.0, 0, '120')]\n \"\"\"", "canonical_solution": " if n_walks < 0 or n_steps < 0:\n raise ValueError(\"Walks and steps cannot be negative.\")\n np.random.seed(seed)\n COLORS = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]\n color_cycle = itertools.cycle(COLORS)\n fig, ax = plt.subplots()\n for _ in range(n_walks):\n walk = np.random.choice([-1, 1], size=n_steps)\n walk = np.cumsum(walk)\n ax.plot(walk, next(color_cycle))\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic setup\n ax = f_372(5, 100, seed=42)\n self.assertIsInstance(ax, plt.Axes)\n def test_case_2(self):\n # Test number of walks\n for n_walk in [0, 1, 2, 10, 50]:\n ax = f_372(n_walk, 10, seed=42)\n lines = ax.get_lines()\n self.assertEqual(len(lines), n_walk)\n def test_case_3(self):\n # Test number of steps\n for n_steps in [0, 1, 10, 100, 500]:\n ax = f_372(2, n_steps, seed=42)\n lines = ax.get_lines()\n self.assertEqual(len(lines[0].get_ydata()), n_steps)\n def test_case_4(self):\n # Test random seed\n ax1 = f_372(5, 100, seed=42)\n ax2 = f_372(5, 100, seed=42)\n ax3 = f_372(5, 100, seed=0)\n lines1 = ax1.get_lines()\n lines2 = ax2.get_lines()\n lines3 = ax3.get_lines()\n self.assertTrue(\n all(\n np.array_equal(line1.get_ydata(), line2.get_ydata())\n for line1, line2 in zip(lines1, lines2)\n )\n )\n self.assertFalse(\n all(\n np.array_equal(line1.get_ydata(), line3.get_ydata())\n for line1, line3 in zip(lines1, lines3)\n ),\n \"Random walks are not reproducible using 
the same seed.\",\n )\n def test_case_5(self):\n # Test invalid n_walks\n with self.assertRaises(ValueError):\n f_372(-1, 100, seed=42)\n def test_case_6(self):\n # Test negative n_steps\n with self.assertRaises(ValueError):\n f_372(1, -100, seed=42)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.cumsum", "numpy.random", "itertools.cycle", "matplotlib.pyplot.subplots", "numpy.random.choice", "numpy.random.seed"], "libs": ["itertools", "numpy", "matplotlib"], "doc": {"description": ["Create and plot `n_walks` number of random walks, each with `n_steps` steps.", "The function checks for valid n_walks and n_steps, then generates walks via numpy.", "Each walk is plotted in a different color cycling through a predefined set of colors:", "['b', 'g', 'r', 'c', 'm', 'y', 'k']."], "note": [], "params": ["n_walks (int): The number of random walks to be generated and plotted.", "n_steps (int): The number of steps in each random walk.", "seed (int, optional): Seed for random number generation. Default is None."], "returns": ["ax (plt.Axes): A Matplotlib Axes containing the plotted random walks."], "reqs": ["numpy", "matplotlib", "itertools"], "raises": [], "example": [">>> ax = f_372(5, 100, seed=42)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-20.0, 0, '\u221220'), Text(0.0, 0, '0'), Text(20.0, 0, '20'), Text(40.0, 0, '40'), Text(60.0, 0, '60'), Text(80.0, 0, '80'), Text(100.0, 0, '100'), Text(120.0, 0, '120')]"]}} +{"task_id": "f_392", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_392(days, random_seed=0):\n \"\"\"\n Generates a spending report DataFrame for the given number of days.\n\n This function takes a number of days as input and populates a pandas DataFrame\n with fake expenditure data indexed by date. Each day on or after '2023-01-01'\n has its own row. 
The DataFrame has five columns: Groceries, Entertainment, Rent,\n Utilities, and Miscellaneous, with their integer values independently randomly\n sampled from 0 to 100.\n\n Parameters:\n - days (int): Number of days for which the report is to be generated.\n This is used to generate dates starting from '2023-01-01'.\n For example, a 'days' of 2 will generate data for '2023-01-01',\n '2023-01-02'.\n If 0, this function will return a DataFrame with the expected\n columns that is otherwise empty.\n - random_seed (int): Numpy random seed for reproducibility. Defaults to 0.\n\n Returns:\n - pd.DataFrame: A DataFrame containing spending details for specified days,\n with shape (num_days, 5).\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> df = f_392(5, random_seed=42)\n >>> type(df)\n \n >>> df.head(2)\n Groceries Entertainment Rent Utilities Miscellaneous\n date \n 2023-01-01 51 20 87 52 1\n 2023-01-02 92 82 99 1 63\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n date_rng = pd.date_range(start=\"2023-01-01\", periods=days, freq=\"D\")\n df = pd.DataFrame(date_rng, columns=[\"date\"])\n df.set_index(\"date\", inplace=True)\n categories = [\"Groceries\", \"Entertainment\", \"Rent\", \"Utilities\", \"Miscellaneous\"]\n for category in categories:\n df[category] = np.random.randint(0, 100, size=(days))\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n report_columns = [\n \"Groceries\",\n \"Entertainment\",\n \"Rent\",\n \"Utilities\",\n \"Miscellaneous\",\n ]\n start_date = pd.to_datetime([\"2023-01-01\"]).day\n def _test_report_structure(self, report, days):\n self.assertIsInstance(report, pd.DataFrame)\n self.assertEqual(report.shape[0], days)\n self.assertEqual(report.shape[1], len(self.report_columns))\n self.assertEqual(list(report.columns), self.report_columns)\n def _test_report_data(self, report):\n self.assertFalse(report.isnull().values.any())\n 
self.assertTrue(pd.api.types.is_datetime64_ns_dtype(report.index))\n self.assertTrue(report.index.day.map(lambda d: d >= self.start_date).all())\n for col in report:\n self.assertTrue((report[col] >= 0).all() and (report[col] <= 100).all())\n def _test_report(self, report, days):\n self._test_report_structure(report, days)\n self._test_report_data(report)\n def test_case_1(self):\n # Test basic case with default parameters\n days = 7\n report = f_392(days)\n self._test_report(report, days)\n def test_case_2(self):\n # Test handling 0 days\n days = 0\n report = f_392(days)\n self._test_report(report, days)\n def test_case_3(self):\n # Test handling larger number of days\n days = 1000\n report = f_392(days)\n self._test_report(report, days)\n def test_case_4(self):\n # Test handling invalid inputs\n with self.assertRaises(ValueError):\n f_392(-1)\n with self.assertRaises(ValueError):\n f_392(None)\n with self.assertRaises(TypeError):\n f_392(\"-1\")\n def test_case_5(self):\n # Test random seed reproducibility\n days = 100\n report1 = f_392(days, random_seed=42)\n report2 = f_392(days, random_seed=42)\n self.assertTrue(report1.equals(report2))\n self._test_report(report1, days)\n self._test_report(report2, days)\n def test_case_6(self):\n # Test random seed variation\n days = 100\n report1 = f_392(days, random_seed=24)\n report2 = f_392(days, random_seed=42)\n self.assertFalse(report1.equals(report2))\n self._test_report(report1, days)\n self._test_report(report2, days)", "apis": ["numpy.random.randint", "pandas.DataFrame", "numpy.random", "pandas.date_range", "numpy.random.seed"], "libs": ["pandas", "numpy"], "doc": {"description": ["Generates a spending report DataFrame for the given number of days.", "This function takes a number of days as input and populates a pandas DataFrame", "with fake expenditure data indexed by date. Each day on or after '2023-01-01'", "has its own row. 
The DataFrame has five columns: Groceries, Entertainment, Rent,", "Utilities, and Miscellaneous, with their integer values independently randomly", "sampled from 0 to 100."], "note": [], "params": ["days (int): Number of days for which the report is to be generated.", "This is used to generate dates starting from '2023-01-01'.", "For example, a 'days' of 2 will generate data for '2023-01-01',", "'2023-01-02'.", "If 0, this function will return a DataFrame with the expected", "columns that is otherwise empty.", "random_seed (int): Numpy random seed for reproducibility. Defaults to 0."], "returns": ["pd.DataFrame: A DataFrame containing spending details for specified days,", "with shape (num_days, 5)."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> df = f_392(5, random_seed=42)", ">>> type(df)", "", ">>> df.head(2)", "Groceries Entertainment Rent Utilities Miscellaneous", "date", "2023-01-01 51 20 87 52 1", "2023-01-02 92 82 99 1 63"]}} {"task_id": "f_755", "prompt": "from typing import List, Union\nimport numpy as np\n\ndef f_755(data: List[Union[int, str]], repetitions: int = 1):\n \"\"\"\n Calculates the mode(s) and their count(s) in a list of elements that can be repeated a specified number of times.\n \n Parameters:\n - data (List[Union[int, str]]): The original list of elements (integers and/or strings).\n - repetitions (int, optional): The number of times to repeat the original list before calculating the mode. 
Defaults to 1.\n\n Requirements:\n - numpy\n - typing.List\n - typing.Union\n\n Returns:\n - dict: A dictionary with two keys:\n 'mode': a numpy array of the mode(s), sorted in ascending order.\n 'count': a numpy array of the count(s) of the mode(s).\n \"\"\"", "canonical_solution": " \n def calculate_mode(data: List[Union[int, str]]):\n # Use a dictionary to count occurrences, considering both value and type\n counts = {}\n for item in data:\n key = (item, type(item)) # Distinguish between types\n counts[key] = counts.get(key, 0) + 1\n\n # Find the maximum count and corresponding values\n max_count = max(counts.values())\n mode_items = [value for (value, value_type), count in counts.items() if count == max_count]\n\n return mode_items, [max_count] * len(mode_items)\n \n if not data or repetitions <= 0: # Handle empty data or no repetitions\n return {'mode': np.array([], dtype='object'), 'count': np.array([], dtype=int)}\n\n # Repeat the data\n repeated_data = data * repetitions\n\n # Calculate mode\n mode, count = calculate_mode(repeated_data)\n return {'mode': np.sort(mode), 'count': count}", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_empty_list(self):\n expected = {'mode': np.array([], dtype='object').tolist(), 'count': np.array([], dtype=int).tolist()}\n result = f_755([], repetitions=1)\n self.assertEqual({'mode': result['mode'].tolist(), 'count': result['count'].tolist()}, expected)\n def test_single_mode(self):\n result = f_755([1, 2, 2, 3], repetitions=1)\n np.testing.assert_array_equal(result['mode'], np.array([2]))\n np.testing.assert_array_equal(result['count'], np.array([2]))\n def test_multiple_modes_repeated(self):\n result = f_755(['A', 'B'], repetitions=3)\n np.testing.assert_array_equal(result['mode'], np.array(['A', 'B']))\n np.testing.assert_array_equal(result['count'], np.array([3, 3]))\n def test_mixed_types(self):\n # Assuming '1' (string) appears twice, and 1 (int) appears once.\n # The test expects the string 
'1' to be the mode with a count of 2.\n result = f_755([1, '1', '1', 2], repetitions=1)\n np.testing.assert_array_equal(result['mode'], np.array(['1']))\n np.testing.assert_array_equal(result['count'], np.array([2])) # Expected count is 2 for '1'\n def test_no_repetitions(self):\n expected = {'mode': np.array([], dtype='object').tolist(), 'count': np.array([], dtype=int).tolist()}\n result = f_755(['X', 'Y', 'Z'], repetitions=0)\n self.assertEqual({'mode': result['mode'].tolist(), 'count': result['count'].tolist()}, expected)", "apis": ["numpy.sort", "numpy.array"], "libs": ["numpy"], "doc": {"description": ["Calculates the mode(s) and their count(s) in a list of elements that can be repeated a specified number of times."], "note": [], "params": ["data (List[Union[int, str]]): The original list of elements (integers and/or strings).", "repetitions (int, optional): The number of times to repeat the original list before calculating the mode. Defaults to 1."], "returns": ["dict: A dictionary with two keys:", "'mode': a numpy array of the mode(s), sorted in ascending order.", "'count': a numpy array of the count(s) of the mode(s)."], "reqs": ["numpy", "typing.List", "typing.Union"], "raises": [], "example": []}} -{"task_id": "f_796", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport os\n\n\ndef f_796(mystrings, folder_path, seed=None):\n \"\"\"\n Generates random data points to plot bar charts for each in a given list of plot names,\n then saves them in a specified directory.\n\n This function takes a list of plot names, for each generating 10 random data points in [0, 1)\n to create a bar chart, then saves the bar charts as .png files in the specified directory,\n creating the directory if it does not exist.\n\n Parameters:\n - mystrings (list of str): List of names for the plots.\n Each is used as the title for each plot, and each is used to derive\n each plot's filename by replacing spaces with underscores.\n - folder_path (str): Path of the 
folder where the plots will be saved.\n If it does not exist, the function will create it.\n - seed (int, optional): A seed for the random number generator to ensure reproducible results.\n Defaults to None.\n\n Returns:\n - list: Names of the files where the plots are saved. Each file corresponds to a title from `mystrings`.\n\n Raises:\n - FileNotFoundError: If the provided directory path does not exist and cannot be created.\n\n Note:\n - This function deduplicates mystrings while maintaining its original order.\n - Random data points for bar charts are generated in the range [0, 1).\n - Each bar chart contains 10 data points.\n\n Requirements:\n - numpy\n - matplotlib\n - os\n\n Examples:\n >>> f_796(['Plot 1', 'Plot 2'], './test_images/')\n ['Plot_1.png', 'Plot_2.png']\n\n >>> f_796(['First Plot', 'Second Plot'], './another_folder/')\n ['First_Plot.png', 'Second_Plot.png']\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n saved_plots = []\n processed_names = set()\n\n if not os.path.exists(folder_path):\n os.makedirs(folder_path, exist_ok=True)\n\n for name in mystrings:\n if name in processed_names:\n continue\n data = np.random.rand(10)\n plt.bar(range(len(data)), data)\n plt.title(name)\n file_name = name.replace(\" \", \"_\") + \".png\"\n plt.savefig(os.path.join(folder_path, file_name))\n saved_plots.append(file_name)\n processed_names.add(name)\n\n return saved_plots", "test": "import unittest\nimport os\nimport matplotlib.pyplot as plt\nimport shutil\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.test_dir = 'test_images'\n \n def tearDown(self):\n if os.path.exists(self.test_dir):\n shutil.rmtree(self.test_dir)\n def test_case_1(self):\n # Test with a list of two plot names\n output = f_796([\"Plot 1\", \"Plot 2\"], self.test_dir, seed=1)\n expected = [\"Plot_1.png\", \"Plot_2.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n 
self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))\n def test_case_2(self):\n # Test directory creation if not exists\n path = os.path.join(self.test_dir, \"foo\", \"bar\", \"temp\")\n self.assertFalse(os.path.exists(path))\n output = f_796([\"Test A\", \"Test B\", \"Test C\"], path, seed=2)\n expected = [\"Test_A.png\", \"Test_B.png\", \"Test_C.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(path, file_name)))\n def test_case_3(self):\n # Test with an empty list of plot names to ensure no files are created.\n output = f_796([], self.test_dir, seed=3)\n self.assertEqual(output, [])\n self.assertEqual(len(os.listdir(self.test_dir)), 0)\n def test_case_4(self):\n # Test with a list of plot names containing special characters.\n output = f_796([\"Test@A\", \"Test#B\", \"Test&C\"], self.test_dir, seed=4)\n expected = [\"Test@A.png\", \"Test#B.png\", \"Test&C.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))\n def test_case_5(self):\n # Test with a single-element list of plot names, ensuring the function can handle minimal input.\n output = f_796([\"Single Plot\"], self.test_dir, seed=5)\n expected = [\"Single_Plot.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))\n def test_case_6(self):\n # Test with name deduplication\n output = f_796([\"Single Plot\"] * 5, self.test_dir, seed=6)\n expected = [\"Single_Plot.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))", "apis": ["os.path.exists", "numpy.random", "os.path", "numpy.random.seed", "matplotlib.pyplot.title", "matplotlib.pyplot.savefig", "numpy.random.rand", "matplotlib.pyplot.bar", "os.makedirs", "os.path.join"], "libs": ["numpy", 
"matplotlib", "os"], "doc": {"description": ["Generates random data points to plot bar charts for each in a given list of plot names,", "then saves them in a specified directory.", "This function takes a list of plot names, for each generating 10 random data points in [0, 1)", "to create a bar chart, then saves the bar charts as .png files in the specified directory,", "creating the directory if it does not exist.", ">>> f_796(['First Plot', 'Second Plot'], './another_folder/')", "['First_Plot.png', 'Second_Plot.png']"], "note": ["This function deduplicates mystrings while maintaining its original order.", "Random data points for bar charts are generated in the range [0, 1).", "Each bar chart contains 10 data points."], "params": ["mystrings (list of str): List of names for the plots.", "Each is used as the title for each plot, and each is used to derive", "each plot's filename by replacing spaces with underscores.", "folder_path (str): Path of the folder where the plots will be saved.", "If it does not exist, the function will create it.", "seed (int, optional): A seed for the random number generator to ensure reproducible results.", "Defaults to None."], "returns": ["list: Names of the files where the plots are saved. Each file corresponds to a title from `mystrings`."], "reqs": ["numpy", "matplotlib", "os"], "raises": ["FileNotFoundError: If the provided directory path does not exist and cannot be created."], "example": ["Examples:", ">>> f_796(['Plot 1', 'Plot 2'], './test_images/')", "['Plot_1.png', 'Plot_2.png']"]}} -{"task_id": "f_407", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_407(data):\n \"\"\"\n Combine a list of dictionaries with the same keys (fruit names) into a single pandas dataframe\n where NA/NaN values are filled with 0, then generate a line chart of sales.\n The chart should have title 'Fruit Sales over Time', x-axis 'Time', and y-axis 'Sales Quantity'.\n\n Parameters:\n - data (list): A list of dictionaries. 
Each element correspond to sales quantities at a point in time,\n where keys are fruit names (str) and values are sales quantities (int). If values\n are not the expected type, this function raises TypeError.\n\n Returns:\n - matplotlib.axes._subplots.AxesSubplot: The generated plot's Axes object.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12, 'durian': 0}])\n \n >>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12}, {'apple': 12, 'banana': 20, 'cherry': 14}])\n \n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n df.fillna(0, inplace=True)\n for fruit in df.columns:\n plt.plot(df[fruit], label=fruit)\n plt.xlabel(\"Time\")\n plt.ylabel(\"Sales Quantity\")\n plt.title(\"Fruit Sales over Time\")\n plt.legend()\n return plt.gca()", "test": "import unittest\nimport matplotlib\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n data = [{\"apple\": 10}, {\"banana\": 15, \"cherry\": 12}]\n ax = f_407(data)\n # Test default plot values\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertTrue(isinstance(ax.lines[0], matplotlib.lines.Line2D))\n self.assertEqual(ax.get_title(), \"Fruit Sales over Time\")\n self.assertEqual(ax.get_xlabel(), \"Time\")\n self.assertEqual(ax.get_ylabel(), \"Sales Quantity\")\n def test_case_2(self):\n # Test flat input\n data = [{\"apple\": 11, \"banana\": 15, \"cherry\": 12, \"durian\": 10}]\n ax = f_407(data)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(len(ax.lines), len(data[0]))\n for i, (fruit_name, fruit_quantity) in enumerate(data[0].items()):\n self.assertEqual(ax.lines[i]._label, fruit_name)\n self.assertEqual(ax.lines[i]._y, fruit_quantity)\n self.assertIsInstance(ax.lines[i], matplotlib.lines.Line2D)\n def test_case_3(self):\n data = [\n {\"apple\": 15},\n {\"apple\": 2, \"banana\": 11, \"cherry\": 8},\n ]\n ax = f_407(data)\n # Test data correctness\n self.assertTrue(isinstance(ax, 
plt.Axes))\n self.assertEqual(len(ax.lines), 3)\n self.assertEqual(ax.lines[0]._label, \"apple\")\n self.assertEqual(ax.lines[0]._y.tolist(), [15, 2])\n self.assertEqual(ax.lines[1]._label, \"banana\")\n self.assertEqual(ax.lines[1]._y.tolist(), [0, 11])\n self.assertEqual(ax.lines[2]._label, \"cherry\")\n self.assertEqual(ax.lines[2]._y.tolist(), [0, 8])\n def test_case_4(self):\n # Test one fruit only\n data = [{\"apple\": 10}, {\"apple\": 12}, {\"apple\": 15}]\n ax = f_407(data)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(ax.lines[0]._label, \"apple\")\n self.assertEqual(ax.lines[0]._y.tolist(), [10, 12, 15])\n def test_case_5(self):\n # Test that function fails with unexpected data values\n with self.assertRaises(ValueError):\n f_407(\"\")\n with self.assertRaises(ValueError):\n f_407(1)\n # Test that function fails with unexpected data types\n with self.assertRaises(TypeError):\n f_407([\"apple\", 10, \"banana\", 10])\n with self.assertRaises(TypeError):\n f_407([{\"apple\": \"10\"}, {\"cherry\": 10}])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.xlabel", "matplotlib.pyplot.plot", "matplotlib.pyplot.legend", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "pandas.DataFrame"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Combine a list of dictionaries with the same keys (fruit names) into a single pandas dataframe", "where NA/NaN values are filled with 0, then generate a line chart of sales.", "The chart should have title 'Fruit Sales over Time', x-axis 'Time', and y-axis 'Sales Quantity'."], "note": [], "params": ["data (list): A list of dictionaries. Each element correspond to sales quantities at a point in time,", "where keys are fruit names (str) and values are sales quantities (int). 
If values", "are not the expected type, this function raises TypeError."], "returns": ["matplotlib.axes._subplots.AxesSubplot: The generated plot's Axes object."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12, 'durian': 0}])", "", ">>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12}, {'apple': 12, 'banana': 20, 'cherry': 14}])", ""]}} +{"task_id": "f_796", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport os\n\n\ndef f_796(mystrings, folder_path, seed=None):\n \"\"\"\n Generates random data points to plot bar charts for each in a given list of plot names,\n then saves them in a specified directory.\n\n This function takes a list of plot names, for each generating 10 random data points in [0, 1)\n to create a bar chart, then saves the bar charts as .png files in the specified directory,\n creating the directory if it does not exist.\n\n Parameters:\n - mystrings (list of str): List of names for the plots.\n Each is used as the title for each plot, and each is used to derive\n each plot's filename by replacing spaces with underscores.\n - folder_path (str): Path of the folder where the plots will be saved.\n If it does not exist, the function will create it.\n - seed (int, optional): A seed for the random number generator to ensure reproducible results.\n Defaults to None.\n\n Returns:\n - list: Names of the files where the plots are saved. 
Each file corresponds to a title from `mystrings`.\n\n Raises:\n - FileNotFoundError: If the provided directory path does not exist and cannot be created.\n\n Note:\n - This function deduplicates mystrings while maintaining its original order.\n - Random data points for bar charts are generated in the range [0, 1).\n - Each bar chart contains 10 data points.\n\n Requirements:\n - numpy\n - matplotlib\n - os\n\n Examples:\n >>> f_796(['Plot 1', 'Plot 2'], './test_images/')\n ['Plot_1.png', 'Plot_2.png']\n\n >>> f_796(['First Plot', 'Second Plot'], './another_folder/')\n ['First_Plot.png', 'Second_Plot.png']\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n saved_plots = []\n processed_names = set()\n\n if not os.path.exists(folder_path):\n os.makedirs(folder_path, exist_ok=True)\n\n for name in mystrings:\n if name in processed_names:\n continue\n data = np.random.rand(10)\n plt.bar(range(len(data)), data)\n plt.title(name)\n file_name = name.replace(\" \", \"_\") + \".png\"\n plt.savefig(os.path.join(folder_path, file_name))\n saved_plots.append(file_name)\n processed_names.add(name)\n\n return saved_plots", "test": "import unittest\nimport os\nimport matplotlib.pyplot as plt\nimport shutil\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.test_dir = 'test_images'\n \n def tearDown(self):\n if os.path.exists(self.test_dir):\n shutil.rmtree(self.test_dir)\n def test_case_1(self):\n # Test with a list of two plot names\n output = f_796([\"Plot 1\", \"Plot 2\"], self.test_dir, seed=1)\n expected = [\"Plot_1.png\", \"Plot_2.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))\n def test_case_2(self):\n # Test directory creation if not exists\n path = os.path.join(self.test_dir, \"foo\", \"bar\", \"temp\")\n self.assertFalse(os.path.exists(path))\n output = f_796([\"Test A\", \"Test B\", \"Test C\"], path, seed=2)\n expected = 
[\"Test_A.png\", \"Test_B.png\", \"Test_C.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(path, file_name)))\n def test_case_3(self):\n # Test with an empty list of plot names to ensure no files are created.\n output = f_796([], self.test_dir, seed=3)\n self.assertEqual(output, [])\n self.assertEqual(len(os.listdir(self.test_dir)), 0)\n def test_case_4(self):\n # Test with a list of plot names containing special characters.\n output = f_796([\"Test@A\", \"Test#B\", \"Test&C\"], self.test_dir, seed=4)\n expected = [\"Test@A.png\", \"Test#B.png\", \"Test&C.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))\n def test_case_5(self):\n # Test with a single-element list of plot names, ensuring the function can handle minimal input.\n output = f_796([\"Single Plot\"], self.test_dir, seed=5)\n expected = [\"Single_Plot.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))\n def test_case_6(self):\n # Test with name deduplication\n output = f_796([\"Single Plot\"] * 5, self.test_dir, seed=6)\n expected = [\"Single_Plot.png\"]\n self.assertEqual(output, expected)\n for file_name in expected:\n self.assertTrue(os.path.exists(os.path.join(self.test_dir, file_name)))", "apis": ["os.path.exists", "matplotlib.pyplot.bar", "numpy.random", "matplotlib.pyplot.savefig", "os.makedirs", "os.path", "numpy.random.rand", "matplotlib.pyplot.title", "os.path.join", "numpy.random.seed"], "libs": ["os", "numpy", "matplotlib"], "doc": {"description": ["Generates random data points to plot bar charts for each in a given list of plot names,", "then saves them in a specified directory.", "This function takes a list of plot names, for each generating 10 random data points in [0, 1)", "to create a bar chart, then saves the bar charts as .png 
files in the specified directory,", "creating the directory if it does not exist.", ">>> f_796(['First Plot', 'Second Plot'], './another_folder/')", "['First_Plot.png', 'Second_Plot.png']"], "note": ["This function deduplicates mystrings while maintaining its original order.", "Random data points for bar charts are generated in the range [0, 1).", "Each bar chart contains 10 data points."], "params": ["mystrings (list of str): List of names for the plots.", "Each is used as the title for each plot, and each is used to derive", "each plot's filename by replacing spaces with underscores.", "folder_path (str): Path of the folder where the plots will be saved.", "If it does not exist, the function will create it.", "seed (int, optional): A seed for the random number generator to ensure reproducible results.", "Defaults to None."], "returns": ["list: Names of the files where the plots are saved. Each file corresponds to a title from `mystrings`."], "reqs": ["numpy", "matplotlib", "os"], "raises": ["FileNotFoundError: If the provided directory path does not exist and cannot be created."], "example": ["Examples:", ">>> f_796(['Plot 1', 'Plot 2'], './test_images/')", "['Plot_1.png', 'Plot_2.png']"]}} +{"task_id": "f_407", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_407(data):\n \"\"\"\n Combine a list of dictionaries with the same keys (fruit names) into a single pandas dataframe\n where NA/NaN values are filled with 0, then generate a line chart of sales.\n The chart should have title 'Fruit Sales over Time', x-axis 'Time', and y-axis 'Sales Quantity'.\n\n Parameters:\n - data (list): A list of dictionaries. Each element correspond to sales quantities at a point in time,\n where keys are fruit names (str) and values are sales quantities (int). 
If values\n are not the expected type, this function raises TypeError.\n\n Returns:\n - matplotlib.axes._subplots.Axes: The generated plot's Axes object.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12, 'durian': 0}])\n \n >>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12}, {'apple': 12, 'banana': 20, 'cherry': 14}])\n \n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n df.fillna(0, inplace=True)\n for fruit in df.columns:\n plt.plot(df[fruit], label=fruit)\n plt.xlabel(\"Time\")\n plt.ylabel(\"Sales Quantity\")\n plt.title(\"Fruit Sales over Time\")\n plt.legend()\n return plt.gca()", "test": "import unittest\nimport matplotlib\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n data = [{\"apple\": 10}, {\"banana\": 15, \"cherry\": 12}]\n ax = f_407(data)\n # Test default plot values\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertTrue(isinstance(ax.lines[0], matplotlib.lines.Line2D))\n self.assertEqual(ax.get_title(), \"Fruit Sales over Time\")\n self.assertEqual(ax.get_xlabel(), \"Time\")\n self.assertEqual(ax.get_ylabel(), \"Sales Quantity\")\n def test_case_2(self):\n # Test flat input\n data = [{\"apple\": 11, \"banana\": 15, \"cherry\": 12, \"durian\": 10}]\n ax = f_407(data)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(len(ax.lines), len(data[0]))\n for i, (fruit_name, fruit_quantity) in enumerate(data[0].items()):\n self.assertEqual(ax.lines[i]._label, fruit_name)\n self.assertEqual(ax.lines[i]._y, fruit_quantity)\n self.assertIsInstance(ax.lines[i], matplotlib.lines.Line2D)\n def test_case_3(self):\n data = [\n {\"apple\": 15},\n {\"apple\": 2, \"banana\": 11, \"cherry\": 8},\n ]\n ax = f_407(data)\n # Test data correctness\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(len(ax.lines), 3)\n self.assertEqual(ax.lines[0]._label, \"apple\")\n self.assertEqual(ax.lines[0]._y.tolist(), [15, 
2])\n self.assertEqual(ax.lines[1]._label, \"banana\")\n self.assertEqual(ax.lines[1]._y.tolist(), [0, 11])\n self.assertEqual(ax.lines[2]._label, \"cherry\")\n self.assertEqual(ax.lines[2]._y.tolist(), [0, 8])\n def test_case_4(self):\n # Test one fruit only\n data = [{\"apple\": 10}, {\"apple\": 12}, {\"apple\": 15}]\n ax = f_407(data)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(ax.lines[0]._label, \"apple\")\n self.assertEqual(ax.lines[0]._y.tolist(), [10, 12, 15])\n def test_case_5(self):\n # Test that function fails with unexpected data values\n with self.assertRaises(ValueError):\n f_407(\"\")\n with self.assertRaises(ValueError):\n f_407(1)\n # Test that function fails with unexpected data types\n with self.assertRaises(TypeError):\n f_407([\"apple\", 10, \"banana\", 10])\n with self.assertRaises(TypeError):\n f_407([{\"apple\": \"10\"}, {\"cherry\": 10}])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.legend", "pandas.DataFrame", "matplotlib.pyplot.title", "matplotlib.pyplot.plot", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "matplotlib.pyplot.xlabel"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Combine a list of dictionaries with the same keys (fruit names) into a single pandas dataframe", "where NA/NaN values are filled with 0, then generate a line chart of sales.", "The chart should have title 'Fruit Sales over Time', x-axis 'Time', and y-axis 'Sales Quantity'."], "note": [], "params": ["data (list): A list of dictionaries. Each element correspond to sales quantities at a point in time,", "where keys are fruit names (str) and values are sales quantities (int). 
If values", "are not the expected type, this function raises TypeError."], "returns": ["matplotlib.axes._subplots.Axes: The generated plot's Axes object."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12, 'durian': 0}])", "", ">>> f_407([{'apple': 10, 'banana': 15, 'cherry': 12}, {'apple': 12, 'banana': 20, 'cherry': 14}])", ""]}} {"task_id": "f_351", "prompt": "import matplotlib.pyplot as plt\nfrom sklearn.datasets import make_blobs\n\n\ndef f_351(n_samples=100, centers=3, n_features=2, random_seed=42):\n \"\"\"\n Create isotropic Gaussian blobs to form clusters and visualize them.\n\n Parameters:\n - n_samples (int): The total number of points divided among clusters.\n - centers (int): The number of centers to generate.\n - n_features (int): The number of features for each sample.\n - random_seed (int): The seed for the random number generator.\n\n Returns:\n tuple: A tuple containing:\n - X (numpy.ndarray): The matrix of blob points.\n - y (numpy.ndarray): The vector of blob labels.\n - ax (matplotlib.axes.Axes): The Axes object with the scatter plot.\n\n Requirements:\n - matplotlib.pyplot\n - sklearn\n\n Example:\n >>> X, y, ax = f_351(n_samples=500, centers=5, random_seed=0)\n >>> type(X), type(y), type(ax)\n (, , )\n >>> ax\n \n \"\"\"", "canonical_solution": " X, y = make_blobs(\n n_samples=n_samples,\n centers=centers,\n n_features=n_features,\n random_state=random_seed,\n )\n\n fig, ax = plt.subplots()\n ax.scatter(X[:, 0], X[:, 1], c=y)\n\n return X, y, ax", "test": "import unittest\nimport matplotlib\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test default case\n n_samples, n_features, centers = 100, 2, 3\n X, y, ax = f_351()\n self.assertEqual(X.shape, (n_samples, n_features))\n self.assertEqual(y.shape, (n_samples,))\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(set(y)), centers)\n def 
test_case_2(self):\n # Test n_samples\n for n_samples in [1, 50, 100]:\n X, y, _ = f_351(n_samples=n_samples)\n self.assertEqual(X.shape[0], n_samples)\n self.assertEqual(y.shape[0], n_samples)\n def test_case_3(self):\n # Test centers\n for centers in [1, 50, 100]:\n _, y, _ = f_351(centers=centers)\n self.assertEqual(len(set(y)), centers)\n def test_case_4(self):\n # Test n_features\n for n_features in [2, 50, 100]:\n X, y, _ = f_351(n_features=n_features)\n self.assertEqual(X.shape[1], n_features)\n def test_case_5(self):\n # Test random seed\n X1, y1, _ = f_351(n_samples=100, centers=3, n_features=2, random_seed=42)\n X2, y2, _ = f_351(n_samples=100, centers=3, n_features=2, random_seed=42)\n self.assertTrue((X1 == X2).all())\n self.assertTrue((y1 == y2).all())\n def test_case_6(self):\n # Test with the minimum possible values that are still valid\n n_samples, n_features, centers = 1, 2, 1\n X, y, ax = f_351(\n n_samples=1, centers=centers, n_features=n_features, random_seed=0\n )\n self.assertEqual(X.shape, (n_samples, n_features))\n self.assertEqual(y.shape, (n_samples,))\n self.assertEqual(len(set(y)), centers)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n def test_case_7(self):\n # Example of handling an expected failure due to invalid input\n with self.assertRaises(ValueError):\n f_351(n_samples=-100)\n with self.assertRaises(ValueError):\n f_351(centers=-10)\n with self.assertRaises(Exception):\n f_351(n_features=0)\n with self.assertRaises(ValueError):\n f_351(random_seed=\"invalid\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["sklearn.datasets.make_blobs", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "sklearn"], "doc": {"description": ["Create isotropic Gaussian blobs to form clusters and visualize them."], "note": [], "params": ["n_samples (int): The total number of points divided among clusters.", "centers (int): The number of centers to generate.", "n_features (int): The number of features for each sample.", "random_seed 
(int): The seed for the random number generator."], "returns": ["tuple: A tuple containing:", "X (numpy.ndarray): The matrix of blob points.", "y (numpy.ndarray): The vector of blob labels.", "ax (matplotlib.axes.Axes): The Axes object with the scatter plot."], "reqs": ["matplotlib.pyplot", "sklearn"], "raises": [], "example": [">>> X, y, ax = f_351(n_samples=500, centers=5, random_seed=0)", ">>> type(X), type(y), type(ax)", "(, , )", ">>> ax", ""]}} -{"task_id": "f_891", "prompt": "from datetime import datetime\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_891(date_str):\n \"\"\"\n Generates a list of random integers, where the count of integers equals the day of the month in the\n provided date, then generates a line plot of these integers and returns the Axes object of the plot.\n\n Parameters:\n - date_str (str): The date string in \"yyyy-mm-dd\" format.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object containing the plot.\n\n Requirements:\n - datetime.datetime\n - random\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_891('2023-06-15')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " date = datetime.strptime(date_str, \"%Y-%m-%d\")\n num_of_values = date.day\n random_values = [random.randint(1, 100) for _ in range(num_of_values)]\n _, ax = plt.subplots()\n ax.plot(random_values)\n return ax", "test": "import unittest\nimport matplotlib.axes\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_891.\"\"\"\n def test_mid_month(self):\n \"\"\"\n Test the function with a mid-month date.\n Checks if the generated plot has 15 data points for a date like '2023-06-15'.\n \"\"\"\n ax = f_891(\"2023-06-15\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 15)\n def test_beginning_of_month(self):\n \"\"\"\n Test the function with a date at the beginning of the month.\n Checks if the plot has 1 data point for a date like '2023-06-01'.\n \"\"\"\n ax = 
f_891(\"2023-06-01\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 1)\n def test_end_of_month(self):\n \"\"\"\n Test the function with a date at the end of the month.\n Checks if the plot has 31 data points for a date like '2023-07-31'.\n \"\"\"\n ax = f_891(\"2023-07-31\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 31)\n def test_leap_year(self):\n \"\"\"\n Test the function with a leap year date.\n Checks if the plot has 29 data points for a leap year date like '2024-02-29'.\n \"\"\"\n ax = f_891(\"2024-02-29\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 29)\n def test_invalid_date(self):\n \"\"\"\n Test the function with an invalid date format.\n Expects a ValueError to be raised for an incorrectly formatted date.\n \"\"\"\n with self.assertRaises(ValueError):\n f_891(\"2023/06/15\")\n def tearDown(self):\n plt.clf()", "apis": ["random.randint", "matplotlib.pyplot.subplots", "datetime.datetime.strptime"], "libs": ["matplotlib", "random", "datetime"], "doc": {"description": ["Generates a list of random integers, where the count of integers equals the day of the month in the", "provided date, then generates a line plot of these integers and returns the Axes object of the plot."], "note": [], "params": ["date_str (str): The date string in \"yyyy-mm-dd\" format."], "returns": ["matplotlib.axes.Axes: The Axes object containing the plot."], "reqs": ["datetime.datetime", "random", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_891('2023-06-15')", ">>> type(ax)", ""]}} -{"task_id": "f_744", "prompt": "import pandas as pd\nfrom collections import Counter\nimport unittest\n\ndef f_744(d):\n \"\"\"\n Count the occurrence of values with the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d.\"\n\n Parameters:\n d (list): A list of dictionaries.\n\n Returns:\n dict: A dictionary with keys 
as 'x', 'y', and 'z' and values as Counter objects.\n\n Requirements:\n - pandas\n - collections.Counter\n\n Example:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 5}, {'x': 2, 'y': 1, 'z': 7}]\n >>> print(f_744(data))\n {'x': Counter({1: 1, 3: 1, 2: 1}), 'y': Counter({10: 1, 15: 1, 1: 1}), 'z': Counter({5: 2, 7: 1})}\n >>> data = [{'x': 2, 'y': 10}, {'y': 15, 'z': 5}, {'x': 2, 'z': 7}]\n >>> print(f_744(data))\n {'x': Counter({2.0: 2}), 'y': Counter({10.0: 1, 15.0: 1}), 'z': Counter({5.0: 1, 7.0: 1})}\n \"\"\"", "canonical_solution": " df = pd.DataFrame(d)\n counts = {}\n\n for key in ['x', 'y', 'z']:\n if key in df.columns:\n counts[key] = Counter(df[key].dropna().tolist())\n else:\n counts[key] = Counter()\n\n return counts", "test": "class TestCases(unittest.TestCase):\n def test_empty_list(self):\n self.assertEqual(f_744([]), {'x': Counter(), 'y': Counter(), 'z': Counter()})\n def test_all_keys_present(self):\n data = [{'x': 1, 'y': 2, 'z': 3}, {'x': 1, 'y': 3, 'z': 2}]\n expected = {'x': Counter({1: 2}), 'y': Counter({2: 1, 3: 1}), 'z': Counter({3: 1, 2: 1})}\n self.assertEqual(f_744(data), expected)\n def test_missing_keys(self):\n data = [{'x': 1}, {'y': 2}, {'z': 3}]\n expected = {'x': Counter({1: 1}), 'y': Counter({2: 1}), 'z': Counter({3: 1})}\n self.assertEqual(f_744(data), expected)\n def test_duplicate_values(self):\n data = [{'x': 1, 'y': 2, 'z': 3}, {'x': 1, 'y': 2, 'z': 3}, {'x': 1, 'y': 2}]\n expected = {'x': Counter({1: 3}), 'y': Counter({2: 3}), 'z': Counter({3: 2})}\n self.assertEqual(f_744(data), expected)\n def test_mixed_data_types(self):\n data = [{'x': 1, 'y': 'a', 'z': 3.5}, {'x': '1', 'y': 'a', 'z': 3.5}]\n expected = {'x': Counter({1: 1, '1': 1}), 'y': Counter({'a': 2}), 'z': Counter({3.5: 2})}\n self.assertEqual(f_744(data), expected)", "apis": ["pandas.DataFrame", "collections.Counter"], "libs": ["pandas", "collections"], "doc": {"description": ["Count the occurrence of values with the keys \"x,\" \"y\" and \"z\" 
from a list of dictionaries \"d.\""], "note": [], "params": ["d (list): A list of dictionaries."], "returns": ["dict: A dictionary with keys as 'x', 'y', and 'z' and values as Counter objects."], "reqs": ["pandas", "collections.Counter"], "raises": [], "example": [">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 5}, {'x': 2, 'y': 1, 'z': 7}]", ">>> print(f_744(data))", "{'x': Counter({1: 1, 3: 1, 2: 1}), 'y': Counter({10: 1, 15: 1, 1: 1}), 'z': Counter({5: 2, 7: 1})}", ">>> data = [{'x': 2, 'y': 10}, {'y': 15, 'z': 5}, {'x': 2, 'z': 7}]", ">>> print(f_744(data))", "{'x': Counter({2.0: 2}), 'y': Counter({10.0: 1, 15.0: 1}), 'z': Counter({5.0: 1, 7.0: 1})}"]}} -{"task_id": "f_841", "prompt": "import urllib.request\nimport os\nimport hashlib\nimport tarfile\n\n# Constants\nTARGET_TAR_FILE = \"downloaded_files.tar.gz\"\nEXPECTED_MD5_CHECKSUM = \"d41d8cd98f00b204e9800998ecf8427e\"\n\n\ndef f_841(url):\n \"\"\"\n Downloads a tar.gz file from a specified URL, then validates its MD5 checksum against a predefined expected value.\n If the checksum matches, it extracts the contents of the tar.gz file. Otherwise, it deletes the downloaded file.\n\n Parameters:\n url (str): The URL from which to download the tar.gz file.\n\n Returns:\n bool: Returns True if the file is successfully downloaded, its MD5 checksum matches the expected value, and\n it is extracted. 
Returns False if the checksum does not match the expected value or if the download fails.\n\n Requirements:\n - urllib.request\n - hashlib\n - tarfile\n - os\n\n Example:\n >>> f_841('http://example.com/files.tar.gz')\n True\n \"\"\"", "canonical_solution": " try:\n urllib.request.urlretrieve(url, TARGET_TAR_FILE)\n except Exception as e:\n print(e)\n return False\n\n md5_hash = hashlib.md5()\n with open(TARGET_TAR_FILE, \"rb\") as f:\n for byte_block in iter(lambda: f.read(4096), b\"\"):\n md5_hash.update(byte_block)\n if md5_hash.hexdigest() != EXPECTED_MD5_CHECKSUM:\n os.remove(TARGET_TAR_FILE)\n return False\n\n with tarfile.open(TARGET_TAR_FILE, \"r:gz\") as tar_ref:\n tar_ref.extractall()\n\n os.remove(TARGET_TAR_FILE)\n\n return True", "test": "import unittest\nfrom unittest.mock import patch\nimport urllib.request\nimport hashlib\nimport os\n# Constants from the f_841 function\nTARGET_TAR_FILE = \"downloaded_files.tar.gz\"\nEXPECTED_MD5_CHECKSUM = \"d41d8cd98f00b204e9800998ecf8427e\"\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_841 function.\"\"\"\n def setUp(self):\n self.valid_url = \"http://example.com/valid.tar.gz\"\n self.invalid_checksum_url = \"http://example.com/invalid_checksum.tar.gz\"\n # Create a minimal tar.gz file to simulate download\n with tarfile.open(TARGET_TAR_FILE, \"w:gz\") as tar:\n tar.add(__file__, arcname=os.path.basename(__file__))\n def test_valid_file(self):\n \"\"\"Test that a valid file is downloaded, its checksum is validated, and it is extracted.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as mock_md5:\n mock_md5.return_value.hexdigest.return_value = EXPECTED_MD5_CHECKSUM\n result = f_841(self.valid_url)\n self.assertTrue(result)\n self.assertFalse(os.path.exists(TARGET_TAR_FILE))\n def test_invalid_checksum_valid_format(self):\n \"\"\"Test that a file with an invalid checksum is not extracted.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as 
mock_md5:\n mock_md5.return_value.hexdigest.return_value = \"invalidchecksum\"\n result = f_841(self.invalid_checksum_url)\n self.assertFalse(result)\n self.assertFalse(os.path.exists(TARGET_TAR_FILE))\n def test_download_failure(self):\n \"\"\"Test that a file that fails to download is not extracted.\"\"\"\n with patch(\n \"urllib.request.urlretrieve\", side_effect=Exception(\"Download failed\")\n ):\n result = f_841(self.valid_url)\n self.assertFalse(result)\n def test_file_removal_after_failure(self):\n \"\"\"Test that a file that fails to download is removed.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as mock_md5:\n mock_md5.return_value.hexdigest.return_value = \"invalidchecksum\"\n f_841(self.invalid_checksum_url)\n self.assertFalse(os.path.exists(TARGET_TAR_FILE))\n def test_extraction_success(self):\n \"\"\"Test that a file is extracted if its checksum is valid.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as mock_md5:\n mock_md5.return_value.hexdigest.return_value = EXPECTED_MD5_CHECKSUM\n result = f_841(self.valid_url)\n self.assertTrue(result)\n def tearDown(self):\n # Clean up any created files\n if os.path.exists(TARGET_TAR_FILE):\n os.remove(TARGET_TAR_FILE)", "apis": ["urllib.request.urlretrieve", "os.remove", "hashlib.md5", "tarfile.open", "urllib.request"], "libs": ["urllib", "hashlib", "os", "tarfile"], "doc": {"description": ["Downloads a tar.gz file from a specified URL, then validates its MD5 checksum against a predefined expected value.", "If the checksum matches, it extracts the contents of the tar.gz file. Otherwise, it deletes the downloaded file."], "note": [], "params": ["url (str): The URL from which to download the tar.gz file."], "returns": ["bool: Returns True if the file is successfully downloaded, its MD5 checksum matches the expected value, and", "it is extracted. 
Returns False if the checksum does not match the expected value or if the download fails."], "reqs": ["urllib.request", "hashlib", "tarfile", "os"], "raises": [], "example": [">>> f_841('http://example.com/files.tar.gz')", "True"]}} -{"task_id": "f_396", "prompt": "import numpy as np\nimport pandas as pd\nfrom datetime import datetime\n\n\ndef f_396(\n days_in_past=7, stock_names=[\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"], random_seed=0\n):\n \"\"\"\n Create a DataFrame of stock prices for a specified number of days in the past using random data.\n\n Parameters:\n - days_in_past (int, optional): The number of days in the past for which we want stock data.\n Must be positive. Defaults to 7.\n - stock_names (list of str, optional): The list of stock names for which we want data.\n Must not be empty. Defaults to [\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"].\n - random_seed (int, optional): The seed for random number generation to ensure reproducibility. Defaults to 0.\n\n Returns:\n DataFrame: A pandas DataFrame containing random stock prices for the specified number of days.\n Prices are floats in [0.0,1.0).\n\n Requirements:\n - datetime.datetime\n - pandas\n - numpy\n\n Example:\n >>> df = f_396(5, random_seed=42)\n >>> type(df)\n \n >>> print(df.head(1))\n AAPL GOOGL MSFT AMZN FB\n 2024-03-30 37.454012 95.071431 73.199394 59.865848 15.601864\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n\n if not isinstance(days_in_past, int) or days_in_past <= 0:\n raise ValueError(\"days_in_past must be a positive integer.\")\n if not stock_names or not all(isinstance(name, str) for name in stock_names):\n raise ValueError(\"stock_names must be a list of strings and cannot be empty.\")\n\n dates = pd.date_range(end=datetime.now().date(), periods=days_in_past)\n prices = np.random.rand(days_in_past, len(stock_names)) * 100\n df = pd.DataFrame(prices, columns=stock_names, index=dates)\n\n return df", "test": "import unittest\nfrom datetime import 
datetime\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n DAYS_IN_PAST = 7\n STOCK_NAMES = [\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"]\n def test_case_1(self):\n # Test with default DAYS_IN_PAST value and random seed\n df = f_396(random_seed=42)\n self.assertEqual(\n df.shape[0],\n self.DAYS_IN_PAST,\n \"Number of rows should be equal to days_in_past.\",\n )\n self.assertEqual(\n list(df.columns), self.STOCK_NAMES, \"Columns should match STOCK_NAMES.\"\n )\n self.assertEqual(\n df.index[-1].date(),\n datetime.now().date(),\n \"Last date should be today's date.\",\n )\n self.assertTrue(\n all(df.applymap(lambda x: isinstance(x, (int, float)))),\n \"All values should be numeric.\",\n )\n def test_case_2(self):\n # Test with 1 day in the past (Today's stock prices) and random seed\n df = f_396(1, random_seed=42)\n self.assertEqual(df.shape[0], 1, \"Number of rows should be 1.\")\n self.assertEqual(\n list(df.columns), self.STOCK_NAMES, \"Columns should match STOCK_NAMES.\"\n )\n self.assertEqual(\n df.index[-1].date(),\n datetime.now().date(),\n \"Last date should be today's date.\",\n )\n self.assertTrue(\n all(df.applymap(lambda x: isinstance(x, (int, float)))),\n \"All values should be numeric.\",\n )\n def test_case_3(self):\n # Test with 10 days in the past and random seed\n df = f_396(10, random_seed=42)\n self.assertEqual(df.shape[0], 10, \"Number of rows should be 10.\")\n self.assertEqual(\n list(df.columns), self.STOCK_NAMES, \"Columns should match STOCK_NAMES.\"\n )\n self.assertEqual(\n df.index[-1].date(),\n datetime.now().date(),\n \"Last date should be today's date.\",\n )\n self.assertTrue(\n all(df.applymap(lambda x: isinstance(x, (int, float)))),\n \"All values should be numeric.\",\n )\n def test_case_4(self):\n # Test invalid days in the past\n with self.assertRaises(ValueError):\n f_396(days_in_past=-1)\n with self.assertRaises(ValueError):\n f_396(days_in_past=0)\n with self.assertRaises(ValueError):\n f_396(days_in_past=2.5)\n 
def test_case_5(self):\n # Test empty and invalid stock names\n with self.assertRaises(ValueError):\n f_396(stock_names=[])\n with self.assertRaises(ValueError):\n f_396(stock_names=[\"AAPL\", 123, None])\n def test_case_6(self):\n # Test random seed\n df1a = f_396(random_seed=42)\n df1b = f_396(random_seed=42)\n df2 = f_396(random_seed=99)\n pd.testing.assert_frame_equal(df1a, df1b)\n self.assertFalse(df1a.equals(df2))\n self.assertFalse(df1b.equals(df2))\n def test_case_7(self):\n # Test larger days_in_the_past\n df = f_396(days_in_past=366)\n self.assertEqual(df.shape[0], 366)\n def test_case_8(self):\n # Test single stock name\n df = f_396(stock_names=[\"ABC\"])\n self.assertTrue(\"ABC\" in df.columns)", "apis": ["numpy.random", "numpy.random.seed", "datetime.datetime.now", "pandas.date_range", "numpy.random.rand", "pandas.DataFrame"], "libs": ["numpy", "pandas", "datetime"], "doc": {"description": ["Create a DataFrame of stock prices for a specified number of days in the past using random data."], "note": [], "params": ["days_in_past (int, optional): The number of days in the past for which we want stock data.", "Must be positive. Defaults to 7.", "stock_names (list of str, optional): The list of stock names for which we want data.", "Must not be empty. Defaults to [\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"].", "random_seed (int, optional): The seed for random number generation to ensure reproducibility. 
Defaults to 0."], "returns": ["DataFrame: A pandas DataFrame containing random stock prices for the specified number of days.", "Prices are floats in [0.0,1.0)."], "reqs": ["datetime.datetime", "pandas", "numpy"], "raises": [], "example": [">>> df = f_396(5, random_seed=42)", ">>> type(df)", "", ">>> print(df.head(1))", "AAPL GOOGL MSFT AMZN FB", "2024-03-30 37.454012 95.071431 73.199394 59.865848 15.601864"]}} -{"task_id": "f_539", "prompt": "import pandas as pd\nimport json\nimport os\nimport shutil\n\ndef f_539(path):\n \"\"\"\n Processes JSON files in a directory. The function reads each JSON file alphabetically into a DataFrame and inserts a \"Source\" column that specifies the filename. The processed files are then moved to a \"processed\" subdirectory.\n \n Parameters:\n - path (str): The path of the directory containing the JSON files.\n \n Returns:\n - df (pandas.DataFrame): A DataFrame containing the data from all processed files.\n\n Requirements:\n - pandas\n - json\n - os\n - shutil\n \n Example:\n >>> import os\n >>> import shutil\n >>> if os.path.exists('data'):\n ... shutil.rmtree('data')\n >>> os.mkdir('data')\n >>> with open('data/a.json', 'w') as f:\n ... f.write('[{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}]')\n ...\n 36\n >>> with open('data/b.json', 'w') as f:\n ... 
f.write('[{\"a\": 5, \"b\": 6}, {\"a\": 7, \"b\": 8}]')\n ...\n 36\n >>> df = f_539('data')\n >>> print(df)\n a b source\n 0 5 6 b.json\n 1 7 8 b.json\n 0 1 2 a.json\n 1 3 4 a.json\n >>> shutil.rmtree('data')\n \"\"\"", "canonical_solution": "\n df = pd.DataFrame()\n processed_path = os.path.join(path, 'processed')\n\n if not os.path.exists(processed_path):\n os.makedirs(processed_path)\n\n for filename in os.listdir(path):\n if filename.endswith('.json'):\n file_path = os.path.join(path, filename)\n with open(file_path, 'r') as file:\n data = json.load(file)\n if isinstance(data, dict):\n data = [data] # Wrap scalar values in a list\n temp_df = pd.DataFrame(data)\n temp_df['source'] = filename\n df = pd.concat([df, temp_df])\n\n shutil.move(file_path, processed_path)\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n @staticmethod\n def create_json_files(directory, filenames, contents):\n \"\"\"\n Helper function to create JSON files.\n \"\"\"\n if not os.path.exists(directory):\n os.makedirs(directory)\n for filename, content in zip(filenames, contents):\n with open(os.path.join(directory, filename), 'w') as f:\n json.dump(content, f)\n \n def test_basic_operation(self):\n \"\"\"\n Test basic operation with two files.\n \"\"\"\n dir = './test_data_1'\n self.create_json_files(dir, ['a.json', 'b.json'], \n [[{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}], [{\"a\": 5, \"b\": 6}, {\"a\": 7, \"b\": 8}]])\n df = f_539(dir)\n self.assertEqual(len(df), 4)\n shutil.rmtree(dir)\n \n def test_empty_directory(self):\n \"\"\"\n Test operation on an empty directory.\n \"\"\"\n dir = './test_data_2'\n os.makedirs(dir)\n df = f_539(dir)\n self.assertTrue(df.empty)\n shutil.rmtree(dir)\n \n def test_non_json_files(self):\n \"\"\"\n Test operation with non-JSON files in the directory.\n \"\"\"\n dir = './test_data_3'\n self.create_json_files(dir, ['a.json', 'b.txt'], \n [[{\"a\": 1, \"b\": 2}], []])\n df = f_539(dir)\n self.assertEqual(len(df), 1)\n 
shutil.rmtree(dir)\n \n def test_single_file(self):\n \"\"\"\n Test operation with a single JSON file.\n \"\"\"\n dir = './test_data_4'\n self.create_json_files(dir, ['a.json'], \n [[{\"a\": 1, \"b\": 2}]])\n df = f_539(dir)\n self.assertEqual(len(df), 1)\n shutil.rmtree(dir)\n \n def test_with_empty_json_file(self):\n \"\"\"\n Test operation with an empty JSON file.\n \"\"\"\n dir = './test_data_5'\n self.create_json_files(dir, ['a.json'], \n [[]])\n df = f_539(dir)\n self.assertTrue(df.empty)\n shutil.rmtree(dir)", "apis": ["os.path.exists", "os.listdir", "shutil.move", "pandas.concat", "pandas.DataFrame", "json.load", "os.path", "os.makedirs", "os.path.join"], "libs": ["pandas", "json", "shutil", "os"], "doc": {"description": ["Processes JSON files in a directory. The function reads each JSON file alphabetically into a DataFrame and inserts a \"Source\" column that specifies the filename. The processed files are then moved to a \"processed\" subdirectory."], "note": [], "params": ["path (str): The path of the directory containing the JSON files."], "returns": ["df (pandas.DataFrame): A DataFrame containing the data from all processed files."], "reqs": ["pandas", "json", "os", "shutil"], "raises": [], "example": [">>> import os", ">>> import shutil", ">>> if os.path.exists('data'):", "... shutil.rmtree('data')", ">>> os.mkdir('data')", ">>> with open('data/a.json', 'w') as f:", "... f.write('[{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}]')", "...", "36", ">>> with open('data/b.json', 'w') as f:", "... 
f.write('[{\"a\": 5, \"b\": 6}, {\"a\": 7, \"b\": 8}]')", "...", "36", ">>> df = f_539('data')", ">>> print(df)", "a b source", "0 5 6 b.json", "1 7 8 b.json", "0 1 2 a.json", "1 3 4 a.json", ">>> shutil.rmtree('data')"]}} +{"task_id": "f_891", "prompt": "from datetime import datetime\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_891(date_str):\n \"\"\"\n Generates a list of random integers, where the count of integers equals the day of the month in the\n provided date, then generates a line plot of these integers and returns the Axes object of the plot.\n\n Parameters:\n - date_str (str): The date string in \"yyyy-mm-dd\" format.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object containing the plot.\n\n Requirements:\n - datetime.datetime\n - random\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_891('2023-06-15')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " date = datetime.strptime(date_str, \"%Y-%m-%d\")\n num_of_values = date.day\n random_values = [random.randint(1, 100) for _ in range(num_of_values)]\n _, ax = plt.subplots()\n ax.plot(random_values)\n return ax", "test": "import unittest\nimport matplotlib.axes\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_891.\"\"\"\n def test_mid_month(self):\n \"\"\"\n Test the function with a mid-month date.\n Checks if the generated plot has 15 data points for a date like '2023-06-15'.\n \"\"\"\n ax = f_891(\"2023-06-15\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 15)\n def test_beginning_of_month(self):\n \"\"\"\n Test the function with a date at the beginning of the month.\n Checks if the plot has 1 data point for a date like '2023-06-01'.\n \"\"\"\n ax = f_891(\"2023-06-01\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 1)\n def test_end_of_month(self):\n \"\"\"\n Test the function with a date at the end of the month.\n Checks if the plot 
has 31 data points for a date like '2023-07-31'.\n \"\"\"\n ax = f_891(\"2023-07-31\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 31)\n def test_leap_year(self):\n \"\"\"\n Test the function with a leap year date.\n Checks if the plot has 29 data points for a leap year date like '2024-02-29'.\n \"\"\"\n ax = f_891(\"2024-02-29\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines[0].get_ydata()), 29)\n def test_invalid_date(self):\n \"\"\"\n Test the function with an invalid date format.\n Expects a ValueError to be raised for an incorrectly formatted date.\n \"\"\"\n with self.assertRaises(ValueError):\n f_891(\"2023/06/15\")\n def tearDown(self):\n plt.clf()", "apis": ["matplotlib.pyplot.subplots", "random.randint", "datetime.datetime.strptime"], "libs": ["random", "matplotlib", "datetime"], "doc": {"description": ["Generates a list of random integers, where the count of integers equals the day of the month in the", "provided date, then generates a line plot of these integers and returns the Axes object of the plot."], "note": [], "params": ["date_str (str): The date string in \"yyyy-mm-dd\" format."], "returns": ["matplotlib.axes.Axes: The Axes object containing the plot."], "reqs": ["datetime.datetime", "random", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_891('2023-06-15')", ">>> type(ax)", ""]}} +{"task_id": "f_744", "prompt": "import pandas as pd\nfrom collections import Counter\nimport unittest\n\ndef f_744(d):\n \"\"\"\n Count the occurrence of values with the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d.\"\n\n Parameters:\n d (list): A list of dictionaries.\n\n Returns:\n dict: A dictionary with keys as 'x', 'y', and 'z' and values as Counter objects.\n\n Requirements:\n - pandas\n - collections.Counter\n\n Example:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 5}, {'x': 2, 'y': 1, 'z': 7}]\n >>> print(f_744(data))\n 
{'x': Counter({1: 1, 3: 1, 2: 1}), 'y': Counter({10: 1, 15: 1, 1: 1}), 'z': Counter({5: 2, 7: 1})}\n >>> data = [{'x': 2, 'y': 10}, {'y': 15, 'z': 5}, {'x': 2, 'z': 7}]\n >>> print(f_744(data))\n {'x': Counter({2.0: 2}), 'y': Counter({10.0: 1, 15.0: 1}), 'z': Counter({5.0: 1, 7.0: 1})}\n \"\"\"", "canonical_solution": " df = pd.DataFrame(d)\n counts = {}\n\n for key in ['x', 'y', 'z']:\n if key in df.columns:\n counts[key] = Counter(df[key].dropna().tolist())\n else:\n counts[key] = Counter()\n\n return counts", "test": "class TestCases(unittest.TestCase):\n def test_empty_list(self):\n self.assertEqual(f_744([]), {'x': Counter(), 'y': Counter(), 'z': Counter()})\n def test_all_keys_present(self):\n data = [{'x': 1, 'y': 2, 'z': 3}, {'x': 1, 'y': 3, 'z': 2}]\n expected = {'x': Counter({1: 2}), 'y': Counter({2: 1, 3: 1}), 'z': Counter({3: 1, 2: 1})}\n self.assertEqual(f_744(data), expected)\n def test_missing_keys(self):\n data = [{'x': 1}, {'y': 2}, {'z': 3}]\n expected = {'x': Counter({1: 1}), 'y': Counter({2: 1}), 'z': Counter({3: 1})}\n self.assertEqual(f_744(data), expected)\n def test_duplicate_values(self):\n data = [{'x': 1, 'y': 2, 'z': 3}, {'x': 1, 'y': 2, 'z': 3}, {'x': 1, 'y': 2}]\n expected = {'x': Counter({1: 3}), 'y': Counter({2: 3}), 'z': Counter({3: 2})}\n self.assertEqual(f_744(data), expected)\n def test_mixed_data_types(self):\n data = [{'x': 1, 'y': 'a', 'z': 3.5}, {'x': '1', 'y': 'a', 'z': 3.5}]\n expected = {'x': Counter({1: 1, '1': 1}), 'y': Counter({'a': 2}), 'z': Counter({3.5: 2})}\n self.assertEqual(f_744(data), expected)", "apis": ["pandas.DataFrame", "collections.Counter"], "libs": ["collections", "pandas"], "doc": {"description": ["Count the occurrence of values with the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d.\""], "note": [], "params": ["d (list): A list of dictionaries."], "returns": ["dict: A dictionary with keys as 'x', 'y', and 'z' and values as Counter objects."], "reqs": ["pandas", "collections.Counter"], 
"raises": [], "example": [">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 5}, {'x': 2, 'y': 1, 'z': 7}]", ">>> print(f_744(data))", "{'x': Counter({1: 1, 3: 1, 2: 1}), 'y': Counter({10: 1, 15: 1, 1: 1}), 'z': Counter({5: 2, 7: 1})}", ">>> data = [{'x': 2, 'y': 10}, {'y': 15, 'z': 5}, {'x': 2, 'z': 7}]", ">>> print(f_744(data))", "{'x': Counter({2.0: 2}), 'y': Counter({10.0: 1, 15.0: 1}), 'z': Counter({5.0: 1, 7.0: 1})}"]}} +{"task_id": "f_841", "prompt": "import urllib.request\nimport os\nimport hashlib\nimport tarfile\n\n# Constants\nTARGET_TAR_FILE = \"downloaded_files.tar.gz\"\nEXPECTED_MD5_CHECKSUM = \"d41d8cd98f00b204e9800998ecf8427e\"\n\n\ndef f_841(url):\n \"\"\"\n Downloads a tar.gz file from a specified URL, then validates its MD5 checksum against a predefined expected value.\n If the checksum matches, it extracts the contents of the tar.gz file. Otherwise, it deletes the downloaded file.\n\n Parameters:\n url (str): The URL from which to download the tar.gz file.\n\n Returns:\n bool: Returns True if the file is successfully downloaded, its MD5 checksum matches the expected value, and\n it is extracted. 
Returns False if the checksum does not match the expected value or if the download fails.\n\n Requirements:\n - urllib.request\n - hashlib\n - tarfile\n - os\n\n Example:\n >>> f_841('http://example.com/files.tar.gz')\n True\n \"\"\"", "canonical_solution": " try:\n urllib.request.urlretrieve(url, TARGET_TAR_FILE)\n except Exception as e:\n print(e)\n return False\n\n md5_hash = hashlib.md5()\n with open(TARGET_TAR_FILE, \"rb\") as f:\n for byte_block in iter(lambda: f.read(4096), b\"\"):\n md5_hash.update(byte_block)\n if md5_hash.hexdigest() != EXPECTED_MD5_CHECKSUM:\n os.remove(TARGET_TAR_FILE)\n return False\n\n with tarfile.open(TARGET_TAR_FILE, \"r:gz\") as tar_ref:\n tar_ref.extractall()\n\n os.remove(TARGET_TAR_FILE)\n\n return True", "test": "import unittest\nfrom unittest.mock import patch\nimport urllib.request\nimport hashlib\nimport os\n# Constants from the f_841 function\nTARGET_TAR_FILE = \"downloaded_files.tar.gz\"\nEXPECTED_MD5_CHECKSUM = \"d41d8cd98f00b204e9800998ecf8427e\"\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_841 function.\"\"\"\n def setUp(self):\n self.valid_url = \"http://example.com/valid.tar.gz\"\n self.invalid_checksum_url = \"http://example.com/invalid_checksum.tar.gz\"\n # Create a minimal tar.gz file to simulate download\n with tarfile.open(TARGET_TAR_FILE, \"w:gz\") as tar:\n tar.add(__file__, arcname=os.path.basename(__file__))\n def test_valid_file(self):\n \"\"\"Test that a valid file is downloaded, its checksum is validated, and it is extracted.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as mock_md5:\n mock_md5.return_value.hexdigest.return_value = EXPECTED_MD5_CHECKSUM\n result = f_841(self.valid_url)\n self.assertTrue(result)\n self.assertFalse(os.path.exists(TARGET_TAR_FILE))\n def test_invalid_checksum_valid_format(self):\n \"\"\"Test that a file with an invalid checksum is not extracted.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as 
mock_md5:\n mock_md5.return_value.hexdigest.return_value = \"invalidchecksum\"\n result = f_841(self.invalid_checksum_url)\n self.assertFalse(result)\n self.assertFalse(os.path.exists(TARGET_TAR_FILE))\n def test_download_failure(self):\n \"\"\"Test that a file that fails to download is not extracted.\"\"\"\n with patch(\n \"urllib.request.urlretrieve\", side_effect=Exception(\"Download failed\")\n ):\n result = f_841(self.valid_url)\n self.assertFalse(result)\n def test_file_removal_after_failure(self):\n \"\"\"Test that a file that fails to download is removed.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as mock_md5:\n mock_md5.return_value.hexdigest.return_value = \"invalidchecksum\"\n f_841(self.invalid_checksum_url)\n self.assertFalse(os.path.exists(TARGET_TAR_FILE))\n def test_extraction_success(self):\n \"\"\"Test that a file is extracted if its checksum is valid.\"\"\"\n with patch(\"urllib.request.urlretrieve\"), patch(\"hashlib.md5\") as mock_md5:\n mock_md5.return_value.hexdigest.return_value = EXPECTED_MD5_CHECKSUM\n result = f_841(self.valid_url)\n self.assertTrue(result)\n def tearDown(self):\n # Clean up any created files\n if os.path.exists(TARGET_TAR_FILE):\n os.remove(TARGET_TAR_FILE)", "apis": ["urllib.request", "hashlib.md5", "os.remove", "tarfile.open", "urllib.request.urlretrieve"], "libs": ["os", "urllib", "hashlib", "tarfile"], "doc": {"description": ["Downloads a tar.gz file from a specified URL, then validates its MD5 checksum against a predefined expected value.", "If the checksum matches, it extracts the contents of the tar.gz file. Otherwise, it deletes the downloaded file."], "note": [], "params": ["url (str): The URL from which to download the tar.gz file."], "returns": ["bool: Returns True if the file is successfully downloaded, its MD5 checksum matches the expected value, and", "it is extracted. 
Returns False if the checksum does not match the expected value or if the download fails."], "reqs": ["urllib.request", "hashlib", "tarfile", "os"], "raises": [], "example": [">>> f_841('http://example.com/files.tar.gz')", "True"]}} +{"task_id": "f_396", "prompt": "import numpy as np\nimport pandas as pd\nfrom datetime import datetime\n\n\ndef f_396(\n days_in_past=7, stock_names=[\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"], random_seed=0\n):\n \"\"\"\n Create a DataFrame of stock prices for a specified number of days in the past using random data.\n\n Parameters:\n - days_in_past (int, optional): The number of days in the past for which we want stock data.\n Must be positive. Defaults to 7.\n - stock_names (list of str, optional): The list of stock names for which we want data.\n Must not be empty. Defaults to [\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"].\n - random_seed (int, optional): The seed for random number generation to ensure reproducibility. Defaults to 0.\n\n Returns:\n DataFrame: A pandas DataFrame containing random stock prices for the specified number of days.\n Prices are floats in [0.0,1.0).\n\n Requirements:\n - datetime.datetime\n - pandas\n - numpy\n\n Example:\n >>> df = f_396(5, random_seed=42)\n >>> type(df)\n \n >>> print(df.head(1))\n AAPL GOOGL MSFT AMZN FB\n 2024-03-30 37.454012 95.071431 73.199394 59.865848 15.601864\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n\n if not isinstance(days_in_past, int) or days_in_past <= 0:\n raise ValueError(\"days_in_past must be a positive integer.\")\n if not stock_names or not all(isinstance(name, str) for name in stock_names):\n raise ValueError(\"stock_names must be a list of strings and cannot be empty.\")\n\n dates = pd.date_range(end=datetime.now().date(), periods=days_in_past)\n prices = np.random.rand(days_in_past, len(stock_names)) * 100\n df = pd.DataFrame(prices, columns=stock_names, index=dates)\n\n return df", "test": "import unittest\nfrom datetime import 
datetime\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n DAYS_IN_PAST = 7\n STOCK_NAMES = [\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"]\n def test_case_1(self):\n # Test with default DAYS_IN_PAST value and random seed\n df = f_396(random_seed=42)\n self.assertEqual(\n df.shape[0],\n self.DAYS_IN_PAST,\n \"Number of rows should be equal to days_in_past.\",\n )\n self.assertEqual(\n list(df.columns), self.STOCK_NAMES, \"Columns should match STOCK_NAMES.\"\n )\n self.assertEqual(\n df.index[-1].date(),\n datetime.now().date(),\n \"Last date should be today's date.\",\n )\n self.assertTrue(\n all(df.applymap(lambda x: isinstance(x, (int, float)))),\n \"All values should be numeric.\",\n )\n def test_case_2(self):\n # Test with 1 day in the past (Today's stock prices) and random seed\n df = f_396(1, random_seed=42)\n self.assertEqual(df.shape[0], 1, \"Number of rows should be 1.\")\n self.assertEqual(\n list(df.columns), self.STOCK_NAMES, \"Columns should match STOCK_NAMES.\"\n )\n self.assertEqual(\n df.index[-1].date(),\n datetime.now().date(),\n \"Last date should be today's date.\",\n )\n self.assertTrue(\n all(df.applymap(lambda x: isinstance(x, (int, float)))),\n \"All values should be numeric.\",\n )\n def test_case_3(self):\n # Test with 10 days in the past and random seed\n df = f_396(10, random_seed=42)\n self.assertEqual(df.shape[0], 10, \"Number of rows should be 10.\")\n self.assertEqual(\n list(df.columns), self.STOCK_NAMES, \"Columns should match STOCK_NAMES.\"\n )\n self.assertEqual(\n df.index[-1].date(),\n datetime.now().date(),\n \"Last date should be today's date.\",\n )\n self.assertTrue(\n all(df.applymap(lambda x: isinstance(x, (int, float)))),\n \"All values should be numeric.\",\n )\n def test_case_4(self):\n # Test invalid days in the past\n with self.assertRaises(ValueError):\n f_396(days_in_past=-1)\n with self.assertRaises(ValueError):\n f_396(days_in_past=0)\n with self.assertRaises(ValueError):\n f_396(days_in_past=2.5)\n 
def test_case_5(self):\n # Test empty and invalid stock names\n with self.assertRaises(ValueError):\n f_396(stock_names=[])\n with self.assertRaises(ValueError):\n f_396(stock_names=[\"AAPL\", 123, None])\n def test_case_6(self):\n # Test random seed\n df1a = f_396(random_seed=42)\n df1b = f_396(random_seed=42)\n df2 = f_396(random_seed=99)\n pd.testing.assert_frame_equal(df1a, df1b)\n self.assertFalse(df1a.equals(df2))\n self.assertFalse(df1b.equals(df2))\n def test_case_7(self):\n # Test larger days_in_the_past\n df = f_396(days_in_past=366)\n self.assertEqual(df.shape[0], 366)\n def test_case_8(self):\n # Test single stock name\n df = f_396(stock_names=[\"ABC\"])\n self.assertTrue(\"ABC\" in df.columns)", "apis": ["pandas.DataFrame", "numpy.random", "datetime.datetime.now", "numpy.random.rand", "pandas.date_range", "numpy.random.seed"], "libs": ["numpy", "pandas", "datetime"], "doc": {"description": ["Create a DataFrame of stock prices for a specified number of days in the past using random data."], "note": [], "params": ["days_in_past (int, optional): The number of days in the past for which we want stock data.", "Must be positive. Defaults to 7.", "stock_names (list of str, optional): The list of stock names for which we want data.", "Must not be empty. Defaults to [\"AAPL\", \"GOOGL\", \"MSFT\", \"AMZN\", \"FB\"].", "random_seed (int, optional): The seed for random number generation to ensure reproducibility. 
Defaults to 0."], "returns": ["DataFrame: A pandas DataFrame containing random stock prices for the specified number of days.", "Prices are floats in [0.0,1.0)."], "reqs": ["datetime.datetime", "pandas", "numpy"], "raises": [], "example": [">>> df = f_396(5, random_seed=42)", ">>> type(df)", "", ">>> print(df.head(1))", "AAPL GOOGL MSFT AMZN FB", "2024-03-30 37.454012 95.071431 73.199394 59.865848 15.601864"]}} +{"task_id": "f_539", "prompt": "import pandas as pd\nimport json\nimport os\nimport shutil\n\ndef f_539(path):\n \"\"\"\n Processes JSON files in a directory. The function reads each JSON file alphabetically into a DataFrame and inserts a \"Source\" column that specifies the filename. The processed files are then moved to a \"processed\" subdirectory.\n \n Parameters:\n - path (str): The path of the directory containing the JSON files.\n \n Returns:\n - df (pandas.DataFrame): A DataFrame containing the data from all processed files.\n\n Requirements:\n - pandas\n - json\n - os\n - shutil\n \n Example:\n >>> import os\n >>> import shutil\n >>> if os.path.exists('data'):\n ... shutil.rmtree('data')\n >>> os.mkdir('data')\n >>> with open('data/a.json', 'w') as f:\n ... f.write('[{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}]')\n ...\n 36\n >>> with open('data/b.json', 'w') as f:\n ... 
f.write('[{\"a\": 5, \"b\": 6}, {\"a\": 7, \"b\": 8}]')\n ...\n 36\n >>> df = f_539('data')\n >>> print(df)\n a b source\n 0 5 6 b.json\n 1 7 8 b.json\n 0 1 2 a.json\n 1 3 4 a.json\n >>> shutil.rmtree('data')\n \"\"\"", "canonical_solution": "\n df = pd.DataFrame()\n processed_path = os.path.join(path, 'processed')\n\n if not os.path.exists(processed_path):\n os.makedirs(processed_path)\n\n for filename in os.listdir(path):\n if filename.endswith('.json'):\n file_path = os.path.join(path, filename)\n with open(file_path, 'r') as file:\n data = json.load(file)\n if isinstance(data, dict):\n data = [data] # Wrap scalar values in a list\n temp_df = pd.DataFrame(data)\n temp_df['source'] = filename\n df = pd.concat([df, temp_df])\n\n shutil.move(file_path, processed_path)\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n @staticmethod\n def create_json_files(directory, filenames, contents):\n \"\"\"\n Helper function to create JSON files.\n \"\"\"\n if not os.path.exists(directory):\n os.makedirs(directory)\n for filename, content in zip(filenames, contents):\n with open(os.path.join(directory, filename), 'w') as f:\n json.dump(content, f)\n \n def test_basic_operation(self):\n \"\"\"\n Test basic operation with two files.\n \"\"\"\n dir = './test_data_1'\n self.create_json_files(dir, ['a.json', 'b.json'], \n [[{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}], [{\"a\": 5, \"b\": 6}, {\"a\": 7, \"b\": 8}]])\n df = f_539(dir)\n self.assertEqual(len(df), 4)\n shutil.rmtree(dir)\n \n def test_empty_directory(self):\n \"\"\"\n Test operation on an empty directory.\n \"\"\"\n dir = './test_data_2'\n os.makedirs(dir)\n df = f_539(dir)\n self.assertTrue(df.empty)\n shutil.rmtree(dir)\n \n def test_non_json_files(self):\n \"\"\"\n Test operation with non-JSON files in the directory.\n \"\"\"\n dir = './test_data_3'\n self.create_json_files(dir, ['a.json', 'b.txt'], \n [[{\"a\": 1, \"b\": 2}], []])\n df = f_539(dir)\n self.assertEqual(len(df), 1)\n 
shutil.rmtree(dir)\n \n def test_single_file(self):\n \"\"\"\n Test operation with a single JSON file.\n \"\"\"\n dir = './test_data_4'\n self.create_json_files(dir, ['a.json'], \n [[{\"a\": 1, \"b\": 2}]])\n df = f_539(dir)\n self.assertEqual(len(df), 1)\n shutil.rmtree(dir)\n \n def test_with_empty_json_file(self):\n \"\"\"\n Test operation with an empty JSON file.\n \"\"\"\n dir = './test_data_5'\n self.create_json_files(dir, ['a.json'], \n [[]])\n df = f_539(dir)\n self.assertTrue(df.empty)\n shutil.rmtree(dir)", "apis": ["pandas.DataFrame", "os.makedirs", "os.listdir", "os.path", "os.path.join", "json.load", "os.path.exists", "pandas.concat", "shutil.move"], "libs": ["os", "shutil", "json", "pandas"], "doc": {"description": ["Processes JSON files in a directory. The function reads each JSON file alphabetically into a DataFrame and inserts a \"Source\" column that specifies the filename. The processed files are then moved to a \"processed\" subdirectory."], "note": [], "params": ["path (str): The path of the directory containing the JSON files."], "returns": ["df (pandas.DataFrame): A DataFrame containing the data from all processed files."], "reqs": ["pandas", "json", "os", "shutil"], "raises": [], "example": [">>> import os", ">>> import shutil", ">>> if os.path.exists('data'):", "... shutil.rmtree('data')", ">>> os.mkdir('data')", ">>> with open('data/a.json', 'w') as f:", "... f.write('[{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}]')", "...", "36", ">>> with open('data/b.json', 'w') as f:", "... 
f.write('[{\"a\": 5, \"b\": 6}, {\"a\": 7, \"b\": 8}]')", "...", "36", ">>> df = f_539('data')", ">>> print(df)", "a b source", "0 5 6 b.json", "1 7 8 b.json", "0 1 2 a.json", "1 3 4 a.json", ">>> shutil.rmtree('data')"]}} {"task_id": "f_924", "prompt": "import pandas as pd\nfrom sklearn.linear_model import LinearRegression\n\nDATA = {\n \"Area_String\": [\"1,000\", \"2,000\", \"3,000\", \"4,000\", \"5,000\"],\n \"Price\": [100, 200, 300, 400, 500],\n}\n\n\ndef f_924(area_string, data=DATA):\n \"\"\"\n Predicts the price based on a given area after training a linear regression model.\n\n Parameters:\n - area_string (str): A string representing the area (in square units) for\n which the price needs to be predicted. The string may contain commas.\n - data (dict): Optional. A dictionary with keys 'Area_String' and 'Price'\n representing area values (as strings) and their corresponding prices. Defaults to a predefined dataset.\n\n Returns:\n - float: The predicted price for the given area.\n\n Requirements:\n - pandas\n - sklearn.linear_model\n\n Example:\n >>> f_924('6,000')\n 600.0\n \"\"\"", "canonical_solution": " # Convert area strings to float and prepare data for the model\n df = pd.DataFrame(data)\n df[\"Area_Float\"] = df[\"Area_String\"].str.replace(\",\", \"\").astype(float)\n\n # Train the linear regression model\n X = df[[\"Area_Float\"]]\n Y = df[\"Price\"]\n model = LinearRegression()\n model.fit(X, Y)\n\n # Predict the price for the given area string\n area_float = float(area_string.replace(\",\", \"\"))\n prediction_data = pd.DataFrame([area_float], columns=[\"Area_Float\"])\n price_predicted = model.predict(prediction_data)\n\n return price_predicted[0]", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_924\"\"\"\n def test_correctness(self):\n \"\"\"Test correctness.\"\"\"\n self.assertAlmostEqual(f_924(\"6,000\"), 600, delta=10)\n self.assertAlmostEqual(f_924(\"7,000\"), 700, delta=10)\n def 
test_input_formats(self):\n \"\"\"Test input formats.\"\"\"\n self.assertAlmostEqual(f_924(\"6,500\"), 650, delta=10)\n self.assertAlmostEqual(f_924(\"6500\"), 650, delta=10)\n def test_custom_data(self):\n \"\"\"Test custom data.\"\"\"\n custom_data = {\n \"Area_String\": [\"10\", \"20\", \"30\", \"40\", \"50\"],\n \"Price\": [1, 2, 3, 4, 5],\n }\n self.assertAlmostEqual(f_924(\"60\", data=custom_data), 6, delta=0.1)\n def test_existing_area(self):\n \"\"\"Test existing area.\"\"\"\n self.assertAlmostEqual(f_924(\"5,000\"), 500, delta=5)\n def test_large_area(self):\n \"\"\"Test large area.\"\"\"\n self.assertAlmostEqual(f_924(\"100,000\"), 10000, delta=100)", "apis": ["pandas.DataFrame", "sklearn.linear_model.LinearRegression"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Predicts the price based on a given area after training a linear regression model."], "note": [], "params": ["area_string (str): A string representing the area (in square units) for", "which the price needs to be predicted. The string may contain commas.", "data (dict): Optional. A dictionary with keys 'Area_String' and 'Price'", "representing area values (as strings) and their corresponding prices. Defaults to a predefined dataset."], "returns": ["float: The predicted price for the given area."], "reqs": ["pandas", "sklearn.linear_model"], "raises": [], "example": [">>> f_924('6,000')", "600.0"]}} -{"task_id": "f_354", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_354(data: pd.DataFrame) -> (pd.DataFrame, list):\n \"\"\"\n This function takes a pandas DataFrame and standardizes its features using sklearn's StandardScaler,\n which standardizes features by removing the mean and scaling to unit variance.\n After standardization, it draws a histogram for each feature with 20 bins.\n\n Parameters:\n - data (pd.DataFrame): The input data to be standardized and plotted. 
It is expected to have\n columns named 'Feature1', 'Feature2', 'Feature3', 'Feature4', and 'Feature5'.\n If there are additional data columns, they are ignored.\n\n\n Returns:\n - standardized_data (pd.DataFrame): The standardized data.\n - axes_list (list): A list of matplotlib Axes objects representing the histograms for each feature.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n - sklearn.preprocessing.StandardScaler\n \n Example:\n >>> data = pd.DataFrame({\n ... 'Feature1': [0.5, 0.6, 0.7, 0.8, 0.9],\n ... 'Feature2': [0.1, 0.2, 0.3, 0.4, 0.5],\n ... 'Feature3': [0.9, 0.8, 0.7, 0.6, 0.5],\n ... 'Feature4': [0.5, 0.4, 0.3, 0.2, 0.1],\n ... 'Feature5': [0.1, 0.3, 0.5, 0.7, 0.9]\n ... })\n >>> standardized_data, axes_list = f_354(data)\n >>> type(standardized_data)\n \n >>> axes_list\n [, , , , ]\n >>> type(axes_list[0])\n \n \"\"\"", "canonical_solution": " FEATURES = [\"Feature1\", \"Feature2\", \"Feature3\", \"Feature4\", \"Feature5\"]\n\n scaler = StandardScaler()\n data_standardized = pd.DataFrame(\n scaler.fit_transform(data[FEATURES]), columns=FEATURES\n )\n\n axes_list = []\n for feature in FEATURES:\n fig, ax = plt.subplots()\n ax.hist(data_standardized[feature], bins=20, alpha=0.5)\n ax.set_title(\"Histogram of {}\".format(feature))\n axes_list.append(ax)\n\n return data_standardized, axes_list", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.columns = [\"Feature1\", \"Feature2\", \"Feature3\", \"Feature4\", \"Feature5\"]\n np.random.seed(0)\n def test_case_1(self):\n # Test basic case\n data = pd.DataFrame(\n np.random.rand(100, 5),\n columns=self.columns,\n )\n self.standardized_data_test(data)\n def test_case_2(self):\n # Test standardizing different distribution\n data = pd.DataFrame(\n np.random.exponential(scale=1.0, size=(100, 5)),\n columns=self.columns,\n )\n self.standardized_data_test(data)\n def test_case_3(self):\n # 
Test standardizing data combined from different distributions\n data_1 = np.random.rand(100, 3)\n data_2 = np.random.exponential(scale=1.0, size=(100, 2))\n data = pd.DataFrame(\n np.hstack((data_1, data_2)),\n columns=self.columns,\n )\n self.standardized_data_test(data)\n def test_case_4(self):\n # Test the function with highly skewed data\n data = pd.DataFrame(\n np.random.chisquare(df=1, size=(100, 5)),\n columns=self.columns,\n )\n standardized_data, _ = f_354(data)\n self.assertTrue(np.isclose(standardized_data.std().values, 1, atol=1e-1).all())\n def test_case_5(self):\n # Test function with a dataframe that has only one row\n data = pd.DataFrame(\n {\n \"Feature1\": [0.1],\n \"Feature2\": [0.2],\n \"Feature3\": [0.3],\n \"Feature4\": [0.4],\n \"Feature5\": [0.5],\n }\n )\n _, axes_list = f_354(data)\n self.assertEqual(len(axes_list), 5)\n def test_case_6(self):\n # Test with columns having identical values across all rows.\n data = pd.DataFrame(\n {\n \"Feature1\": [0.1] * 100,\n \"Feature2\": [0.2] * 100,\n \"Feature3\": [0.3] * 100,\n \"Feature4\": [0.4] * 100,\n \"Feature5\": [0.5] * 100,\n }\n )\n standardized_data, _ = f_354(data)\n # Identical values become NaN after standardization because variance is 0\n expected_zeros = pd.DataFrame(\n 0,\n index=np.arange(100),\n columns=self.columns,\n )\n self.assertTrue(np.isclose(standardized_data, expected_zeros).all().all())\n def test_case_7(self):\n # Test with additional columns not in the expected FEATURES set\n data = pd.DataFrame(\n np.random.rand(100, 7),\n columns=self.columns\n + [\n \"Extra1\",\n \"Extra2\",\n ],\n )\n _, axes_list = f_354(data)\n self.assertEqual(len(axes_list), 5)\n def test_case_8(self):\n # Test with missing columns from the expected FEATURES set\n data = pd.DataFrame(\n np.random.rand(100, 3), columns=[\"Feature1\", \"Feature2\", \"Feature3\"]\n )\n with self.assertRaises(KeyError):\n f_354(data)\n def test_case_9(self):\n # Test should fail when there is invalid input - empty 
dataframe\n data = pd.DataFrame()\n with self.assertRaises(KeyError):\n f_354(data)\n def test_case_10(self):\n # Test should fail when there is invalid input - NaN\n data = pd.DataFrame(\n {\n \"Feature1\": [np.nan, 0.2, 0.3],\n \"Feature2\": [0.1, np.nan, 0.3],\n \"Feature3\": [0.2, 0.2, np.nan],\n \"Feature4\": [np.nan, 0.4, 0.5],\n \"Feature5\": [0.5, 0.6, np.nan],\n }\n )\n standardized_data, _ = f_354(data)\n self.assertTrue(standardized_data.isnull().any().any())\n def test_case_11(self):\n # Test should fail when there is invalid input - inf\n data = pd.DataFrame(\n {\n \"Feature1\": [np.inf, 0.2, 0.3],\n \"Feature2\": [0.1, -np.inf, 0.3],\n \"Feature3\": [0.2, 0.2, np.inf],\n \"Feature4\": [-np.inf, 0.4, 0.5],\n \"Feature5\": [0.5, 0.6, -np.inf],\n }\n )\n with self.assertRaises(ValueError):\n f_354(data)\n def test_case_12(self):\n # Test the function with non-numeric columns.\n data = pd.DataFrame(\n {\n \"Feature1\": [\"a\", \"b\", \"c\"],\n \"Feature2\": [\"d\", \"e\", \"f\"],\n \"Feature3\": [\"g\", \"h\", \"i\"],\n \"Feature4\": [\"j\", \"k\", \"l\"],\n \"Feature5\": [\"m\", \"n\", \"o\"],\n }\n )\n with self.assertRaises(ValueError):\n f_354(data)\n def test_case_13(self):\n # Function should fail if more than expected number of features (5)\n data = pd.DataFrame(np.random.rand(100, 50))\n with self.assertRaises(KeyError):\n f_354(data)\n def standardized_data_test(self, data):\n np.random.seed(0)\n standardized_data, axes_list = f_354(data)\n # Check if the data is standardized (mean ~ 0 and standard deviation ~ 1)\n self.assertTrue(np.isclose(standardized_data.mean().values, 0, atol=1e-2).all())\n self.assertTrue(np.isclose(standardized_data.std().values, 1, atol=1e-1).all())\n # Check the number of returned histograms\n self.assertEqual(len(axes_list), 5)\n # Check if each histogram is correctly titled\n for ax, feature in zip(axes_list, self.columns):\n self.assertEqual(ax.get_title(), f\"Histogram of {feature}\")\n # Check if histograms have 
the right number of bins\n for ax in axes_list:\n self.assertEqual(len(ax.patches), 20)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots", "sklearn.preprocessing.StandardScaler"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["This function takes a pandas DataFrame and standardizes its features using sklearn's StandardScaler,", "which standardizes features by removing the mean and scaling to unit variance.", "After standardization, it draws a histogram for each feature with 20 bins."], "note": [], "params": ["data (pd.DataFrame): The input data to be standardized and plotted. It is expected to have", "columns named 'Feature1', 'Feature2', 'Feature3', 'Feature4', and 'Feature5'.", "If there are additional data columns, they are ignored."], "returns": ["standardized_data (pd.DataFrame): The standardized data.", "axes_list (list): A list of matplotlib Axes objects representing the histograms for each feature."], "reqs": ["pandas", "matplotlib.pyplot", "sklearn.preprocessing.StandardScaler"], "raises": [], "example": [">>> data = pd.DataFrame({", "... 'Feature1': [0.5, 0.6, 0.7, 0.8, 0.9],", "... 'Feature2': [0.1, 0.2, 0.3, 0.4, 0.5],", "... 'Feature3': [0.9, 0.8, 0.7, 0.6, 0.5],", "... 'Feature4': [0.5, 0.4, 0.3, 0.2, 0.1],", "... 'Feature5': [0.1, 0.3, 0.5, 0.7, 0.9]", "... 
})", ">>> standardized_data, axes_list = f_354(data)", ">>> type(standardized_data)", "", ">>> axes_list", "[, , , , ]", ">>> type(axes_list[0])", ""]}} -{"task_id": "f_757", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import zscore\n\ndef f_757(df, z_threshold=2):\n \"\"\"\n Identifies and plots outliers in the 'closing_price' column of a given DataFrame using the Z-Score method.\n \n Parameters:\n df (pandas.DataFrame): The input DataFrame that must contain a column named 'closing_price' with numerical values.\n z_threshold (float, optional): The absolute Z-Score threshold for identifying outliers. Default is 2.\n \n Returns:\n tuple: A tuple containing the following elements:\n - pandas.DataFrame: A DataFrame containing the outliers in the 'closing_price' column.\n - matplotlib.axes._subplots.AxesSubplot: The plot object displaying the outliers.\n \n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - scipy.stats.zscore\n \n Constants:\n - Z-Score threshold for identifying outliers is customizable via the 'z_threshold' parameter.\n \n Examples:\n >>> df1 = pd.DataFrame({\n ... 'closing_price': [100, 101, 102, 103, 104, 150]\n ... })\n >>> outliers1, plot1 = f_757(df1)\n \n >>> df2 = pd.DataFrame({\n ... 'closing_price': [10, 20, 30, 40, 50, 100]\n ... 
})\n >>> outliers2, plot2 = f_757(df2, z_threshold=1.5)\n \"\"\"", "canonical_solution": " # Calculate Z-Scores for the 'closing_price' column\n df['Z_score'] = zscore(df['closing_price'])\n \n # Identify outliers based on Z-Score threshold\n outliers = df[np.abs(df['Z_score']) > z_threshold]\n \n # Create the plot\n fig, ax = plt.subplots(figsize=(10, 5))\n ax.plot(df['closing_price'], color='blue', label='Normal')\n ax.plot(outliers['closing_price'], linestyle='none', marker='X', color='red', markersize=12, label='Outlier')\n ax.set_xlabel('Index')\n ax.set_ylabel('Closing Price')\n ax.set_title('Outliers in Closing Prices')\n ax.legend(loc='best')\n \n return outliers, ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n df1 = pd.DataFrame({\n 'closing_price': [100, 101, 102, 103, 104, 150]\n })\n outliers1, plot1 = f_757(df1)\n self.assertEqual(outliers1['closing_price'].tolist(), [150])\n self.assertEqual(plot1.get_title(), 'Outliers in Closing Prices')\n self.assertEqual(plot1.get_xlabel(), 'Index')\n self.assertEqual(plot1.get_ylabel(), 'Closing Price')\n \n def test_case_2(self):\n df2 = pd.DataFrame({\n 'closing_price': [10, 20, 30, 40, 50, 100]\n })\n outliers2, plot2 = f_757(df2, z_threshold=1.5)\n self.assertEqual(outliers2['closing_price'].tolist(), [100])\n self.assertEqual(outliers2['Z_score'].tolist(), [2.004094170098539])\n \n def test_case_3(self):\n df3 = pd.DataFrame({\n 'closing_price': [112,23,23,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]\n })\n outliers3, plot3 = f_757(df3, z_threshold=3)\n self.assertEqual(outliers3['closing_price'].tolist(), [112])\n self.assertEqual(outliers3['Z_score'].tolist(), [4.309576782241563])\n def test_case_4(self):\n df3 = pd.DataFrame({\n 'closing_price': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 112]\n })\n outliers3, plot3 = f_757(df3, z_threshold=-1)\n self.assertEqual(outliers3['closing_price'].tolist(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 112])\n 
self.assertEqual(outliers3['Z_score'].tolist(), [-0.46136484230149855, -0.42883270598536727, -0.39630056966923594, -0.36376843335310466, -0.3312362970369733, -0.29870416072084205, -0.2661720244047107, -0.2336398880885794, -0.2011077517724481, -0.16857561545631677, 3.1497022887890767])\n \n def test_case_5(self):\n df3 = pd.DataFrame({\n 'closing_price': []\n })\n outliers3, plot3 = f_757(df3, z_threshold=0)\n self.assertEqual(outliers3['closing_price'].tolist(), [])\n self.assertEqual(outliers3['Z_score'].tolist(), [])", "apis": ["numpy.abs", "matplotlib.pyplot.subplots", "scipy.stats.zscore"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Identifies and plots outliers in the 'closing_price' column of a given DataFrame using the Z-Score method.", "Constants:", "- Z-Score threshold for identifying outliers is customizable via the 'z_threshold' parameter.", ">>> df2 = pd.DataFrame({", "... 'closing_price': [10, 20, 30, 40, 50, 100]", "... })", ">>> outliers2, plot2 = f_757(df2, z_threshold=1.5)"], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame that must contain a column named 'closing_price' with numerical values.", "z_threshold (float, optional): The absolute Z-Score threshold for identifying outliers. Default is 2."], "returns": ["tuple: A tuple containing the following elements:", "pandas.DataFrame: A DataFrame containing the outliers in the 'closing_price' column.", "matplotlib.axes._subplots.AxesSubplot: The plot object displaying the outliers."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "scipy.stats.zscore"], "raises": [], "example": ["Examples:", ">>> df1 = pd.DataFrame({", "... 'closing_price': [100, 101, 102, 103, 104, 150]", "... 
})", ">>> outliers1, plot1 = f_757(df1)"]}} -{"task_id": "f_418", "prompt": "import pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_418(df: pd.DataFrame) -> (Counter, plt.Axes):\n \"\"\"\n Identify duplicate entries in a DataFrame and record the age distribution for the duplicate names.\n\n This function takes a DataFrame with 'name' and 'age' columns. If age is provided as floats,\n they will be rounded down to the nearest integer. Age must not be negative, otherwise the function\n raises ValueError. Then, the function identifies duplicate names and records the age distribution.\n It returns a Counter object with the age distribution and a histogram plot showing the distribution\n of ages for duplicate names, with age on the x-axis and count on the y-axis. Bins are calculated\n based on the minimum and maximum ages found among the duplicates, adjusted by .5 to ensure that\n integer ages fall squarely within bins.\n\n Parameters:\n df: pd.DataFrame - A DataFrame with columns 'name' and 'age'.\n Must not be empty. 
If empty, the function raises ValueError.\n\n Returns:\n Counter: Age distribution among duplicate names.\n plt.Axes or None: Histogram plot displaying age distribution, or None if there are no duplicates.\n\n Requirements:\n - pandas\n - numpy\n - collections.Counter\n - seaborn\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice'], 'age': [25, 26, 25]})\n >>> duplicates_counter, ax = f_418(df)\n >>> duplicates_counter\n Counter({25: 2})\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if df.empty:\n raise ValueError(\"Input data cannot be empty.\")\n if any(df[\"age\"] < 0):\n raise ValueError(\"Invalid age: age cannot be less than 0.\")\n\n df[\"age\"] = df[\"age\"].apply(np.floor).astype(int)\n\n duplicate_names = (\n df[\"name\"].value_counts()[df[\"name\"].value_counts() > 1].index.tolist()\n )\n duplicates_df = df[df[\"name\"].isin(duplicate_names)]\n duplicates_counter = Counter(duplicates_df[\"age\"])\n\n if duplicates_counter:\n min_age = duplicates_df[\"age\"].min() - 0.5\n max_age = duplicates_df[\"age\"].max() + 0.5\n bins = np.arange(min_age, max_age + 1)\n ax = sns.histplot(duplicates_df[\"age\"], bins=bins)\n plt.xlabel(\"Age\")\n plt.ylabel(\"Count\")\n plt.title(\"Distribution of Ages for Duplicate Names\")\n else:\n ax = None\n\n return duplicates_counter, ax", "test": "import unittest\nfrom collections import Counter\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Set up various test DataFrames for thorough testing\n self.df_valid = pd.DataFrame(\n {\"name\": [\"Alice\", \"Bob\", \"Alice\"], \"age\": [25, 26, 25]}\n )\n self.df_negative_age = pd.DataFrame(\n {\"name\": [\"Alice\", \"Bob\", \"Charlie\"], \"age\": [25, -1, 27]}\n )\n self.df_no_duplicates = pd.DataFrame(\n {\"name\": [\"Alice\", \"Bob\", \"Charlie\"], \"age\": [25, 26, 27]}\n )\n self.df_all_duplicates = pd.DataFrame(\n {\"name\": [\"Alice\", \"Alice\", \"Alice\"], 
\"age\": [25, 25, 25]}\n )\n self.df_mixed = pd.DataFrame(\n {\n \"name\": [\"Alice\", \"Bob\", \"Alice\", \"Bob\", \"Charlie\"],\n \"age\": [25, 26, 25, 27, 26],\n }\n )\n self.df_floats = pd.DataFrame(\n {\n \"name\": [\"Alice\", \"Bob\", \"Alice\", \"Bob\", \"Charlie\"],\n \"age\": [25.2, 26.1, 25.3, 27.5, 26.8],\n }\n )\n self.df_empty = pd.DataFrame({\"name\": [], \"age\": []})\n def _check_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.get_title())\n self.assertEqual(ax.get_xlabel(), \"Age\")\n self.assertEqual(ax.get_ylabel(), \"Count\")\n def test_case_1(self):\n # Test for a simple valid case with duplicates\n result, ax = f_418(self.df_valid)\n expected = Counter({25: 2})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_2(self):\n # Test for handling of negative ages\n with self.assertRaises(ValueError):\n f_418(self.df_negative_age)\n def test_case_3(self):\n # Test for no duplicates\n result, ax = f_418(self.df_no_duplicates)\n expected = Counter()\n self.assertEqual(result, expected)\n self.assertIsNone(ax)\n def test_case_4(self):\n # Test for all entries being duplicates\n result, ax = f_418(self.df_all_duplicates)\n expected = Counter({25: 3})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_5(self):\n # Test for a mix of duplicates and unique names\n result, ax = f_418(self.df_mixed)\n expected = Counter({25: 2, 26: 1, 27: 1})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_6(self):\n # Test for floats\n result, ax = f_418(self.df_floats)\n expected = Counter({25: 2, 26: 1, 27: 1})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_7(self):\n # Test for an empty DataFrame\n with self.assertRaises(ValueError):\n f_418(self.df_empty)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "collections.Counter", "seaborn.histplot", "numpy.floor", "matplotlib.pyplot.xlabel", "matplotlib.pyplot.Axes", 
"matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "pandas.DataFrame"], "libs": ["numpy", "collections", "matplotlib", "pandas", "seaborn"], "doc": {"description": ["Identify duplicate entries in a DataFrame and record the age distribution for the duplicate names.", "This function takes a DataFrame with 'name' and 'age' columns. If age is provided as floats,", "they will be rounded down to the nearest integer. Age must not be negative, otherwise the function", "raises ValueError. Then, the function identifies duplicate names and records the age distribution.", "It returns a Counter object with the age distribution and a histogram plot showing the distribution", "of ages for duplicate names, with age on the x-axis and count on the y-axis. Bins are calculated", "based on the minimum and maximum ages found among the duplicates, adjusted by .5 to ensure that", "integer ages fall squarely within bins."], "note": [], "params": ["df: pd.DataFrame - A DataFrame with columns 'name' and 'age'.", "Must not be empty. If empty, the function raises ValueError."], "returns": ["Counter: Age distribution among duplicate names.", "plt.Axes or None: Histogram plot displaying age distribution, or None if there are no duplicates."], "reqs": ["pandas", "numpy", "collections.Counter", "seaborn", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice'], 'age': [25, 26, 25]})", ">>> duplicates_counter, ax = f_418(df)", ">>> duplicates_counter", "Counter({25: 2})", ">>> type(ax)", ""]}} -{"task_id": "f_401", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_401(column, data):\n \"\"\"\n Analyze a list of fitness data, calculate the sum, the mean, the minimum,\n the maximum of a certain column and draw a line chart. 
Additionally, validate\n that the numeric values for steps, calories burned, and distance walked are\n non-negative.\n\n Parameters:\n column (str): The column to analyze from the data. The allowed columns are:\n 'Date', 'Steps', 'Calories Burned', 'Distance Walked'.\n data (list of list): A list where each inner list contains a datetime object\n representing the date, followed by numeric values for steps,\n calories burned, and distance walked in that order. Each\n numeric value must be non-negative. Must not be empty.\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the sum, mean, min, max of the column.\n - matplotlib.axes.Axes: The Axes object of the plotted line chart. The line\n chart will have Date on its x-axis, the column value\n on its y-axis, and title Line Chart of (column).\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> data = [[datetime(2022, 1, 1), 5000, 200, 3.5],\n ... [datetime(2022, 1, 2), 5500, 220, 4.0],\n ... [datetime(2022, 1, 3), 6000, 240, 4.5]]\n >>> stats, ax = f_401('Steps', data)\n >>> type(ax)\n \n >>> print(stats)\n {'sum': 16500, 'mean': 5500.0, 'min': 5000, 'max': 6000}\n \"\"\"", "canonical_solution": " COLUMNS = [\"Date\", \"Steps\", \"Calories Burned\", \"Distance Walked\"]\n if column not in COLUMNS:\n raise KeyError(f\"{column} is not a valid column. 
Choose from {COLUMNS}.\")\n\n if not data:\n raise ValueError(\"No data to plot.\")\n df = pd.DataFrame(data, columns=COLUMNS)\n if df[[\"Steps\", \"Calories Burned\", \"Distance Walked\"]].lt(0).any().any():\n raise ValueError(\n \"Numeric values for steps, calories burned, and distance walked must be non-negative.\"\n )\n\n column_data = df[column]\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.mean(column_data),\n \"min\": np.min(column_data),\n \"max\": np.max(column_data),\n }\n\n ax = df.plot.line(x=\"Date\", y=column)\n ax.set_ylabel(column)\n plt.title(f\"Line Chart of {column}\")\n\n return result, ax", "test": "import unittest\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n data = [\n [datetime(2022, 1, 1), 5000, 200, 3.5],\n [datetime(2022, 1, 2), 5500, 220, 4.0],\n [datetime(2022, 1, 3), 6000, 240, 4.5],\n ]\n stats, ax = f_401(\"Steps\", data)\n self.assertEqual(\n stats, {\"sum\": 16500, \"mean\": 5500.0, \"min\": 5000, \"max\": 6000}\n )\n self.assertEqual(ax.get_title(), \"Line Chart of Steps\")\n def test_case_2(self):\n data = [\n [datetime(2022, 1, 1), 5000, 250, 3.5],\n [datetime(2022, 1, 2), 5500, 275, 4.0],\n [datetime(2022, 1, 3), 6000, 300, 4.5],\n ]\n stats, ax = f_401(\"Calories Burned\", data)\n self.assertEqual(stats, {\"sum\": 825, \"mean\": 275.0, \"min\": 250, \"max\": 300})\n self.assertEqual(ax.get_title(), \"Line Chart of Calories Burned\")\n def test_case_3(self):\n data = [\n [datetime(2022, 1, i), 5000 + i * 100, 250 + i * 10, 3.5 + i * 0.1]\n for i in range(1, 11)\n ]\n stats, ax = f_401(\"Distance Walked\", data)\n self.assertEqual(stats, {\"sum\": 40.5, \"mean\": 4.05, \"min\": 3.6, \"max\": 4.5})\n self.assertEqual(ax.get_title(), \"Line Chart of Distance Walked\")\n def test_case_4(self):\n # Test handling zeros\n data = [\n [datetime(2022, 1, 1), 0, 0, 0],\n [datetime(2022, 1, 2), 0, 0, 0],\n [datetime(2022, 1, 3), 0, 0, 0],\n ]\n 
stats, ax = f_401(\"Steps\", data)\n self.assertEqual(stats, {\"sum\": 0, \"mean\": 0.0, \"min\": 0, \"max\": 0})\n self.assertEqual(ax.get_title(), \"Line Chart of Steps\")\n def test_case_5(self):\n # Test larger values\n data = [\n [datetime(2022, 1, 1), 100000, 10000, 1000],\n [datetime(2022, 1, 2), 100000, 10000, 1000],\n [datetime(2022, 1, 3), 100000, 10000, 1000],\n ]\n stats, ax = f_401(\"Calories Burned\", data)\n self.assertEqual(\n stats, {\"sum\": 30000, \"mean\": 10000.0, \"min\": 10000, \"max\": 10000}\n )\n self.assertEqual(ax.get_title(), \"Line Chart of Calories Burned\")\n def test_case_6(self):\n # Test invalid column names\n data = [[datetime(2022, 1, 1), 5000, 200, 3.5]]\n with self.assertRaises(Exception):\n f_401(\"Invalid Column\", data)\n def test_case_7(self):\n # Test negative values\n data = [[datetime(2022, 1, 1), -5000, 200, 3.5]]\n with self.assertRaises(ValueError):\n f_401(\"Steps\", data)\n def test_case_8(self):\n # Test single row\n data = [[datetime(2022, 1, 1), 5000, 200, 3.5]]\n stats, _ = f_401(\"Steps\", data)\n self.assertEqual(stats, {\"sum\": 5000, \"mean\": 5000.0, \"min\": 5000, \"max\": 5000})\n def test_case_9(self):\n # Test non-sequential dates\n data = [\n [datetime(2022, 1, 3), 6000, 240, 4.5],\n [datetime(2022, 1, 1), 5000, 200, 3.5],\n [datetime(2022, 1, 2), 5500, 220, 4.0],\n ]\n stats, _ = f_401(\"Steps\", data)\n # Check data order doesn't affect calculation\n expected_stats = {\"sum\": 16500, \"mean\": 5500.0, \"min\": 5000, \"max\": 6000}\n self.assertEqual(stats, expected_stats)\n def test_case_10(self):\n # Test empty data\n data = []\n with self.assertRaises(Exception):\n f_401(\"Steps\", data)\n def test_case_11(self):\n # Test to ensure plot title and axis labels are correctly set\n data = [\n [datetime(2022, 1, 1), 5000, 200, 3.5],\n [datetime(2022, 1, 2), 5500, 220, 4.0],\n [datetime(2022, 1, 3), 6000, 240, 4.5],\n ]\n _, ax = f_401(\"Steps\", data)\n self.assertEqual(ax.get_title(), \"Line Chart of 
Steps\")\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel(), \"Steps\")\n def test_case_12(self):\n # Test to verify if the correct data points are plotted\n data = [\n [datetime(2022, 1, 1), 100, 50, 1.0],\n [datetime(2022, 1, 2), 200, 100, 2.0],\n ]\n _, ax = f_401(\"Distance Walked\", data)\n lines = ax.get_lines()\n _, y_data = lines[0].get_data()\n expected_y = np.array([1.0, 2.0])\n np.testing.assert_array_equal(y_data, expected_y)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.mean", "matplotlib.pyplot.title", "numpy.min", "numpy.max", "numpy.sum", "pandas.DataFrame"], "libs": ["numpy", "matplotlib", "pandas"], "doc": {"description": ["Analyze a list of fitness data, calculate the sum, the mean, the minimum,", "the maximum of a certain column and draw a line chart. Additionally, validate", "that the numeric values for steps, calories burned, and distance walked are", "non-negative."], "note": [], "params": ["column (str): The column to analyze from the data. The allowed columns are:", "'Date', 'Steps', 'Calories Burned', 'Distance Walked'.", "data (list of list): A list where each inner list contains a datetime object", "representing the date, followed by numeric values for steps,", "calories burned, and distance walked in that order. Each", "numeric value must be non-negative. Must not be empty."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the sum, mean, min, max of the column.", "matplotlib.axes.Axes: The Axes object of the plotted line chart. The line", "chart will have Date on its x-axis, the column value", "on its y-axis, and title Line Chart of (column)."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [[datetime(2022, 1, 1), 5000, 200, 3.5],", "... [datetime(2022, 1, 2), 5500, 220, 4.0],", "... 
[datetime(2022, 1, 3), 6000, 240, 4.5]]", ">>> stats, ax = f_401('Steps', data)", ">>> type(ax)", "", ">>> print(stats)", "{'sum': 16500, 'mean': 5500.0, 'min': 5000, 'max': 6000}"]}} -{"task_id": "f_778", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport string\n\n# Constants\nALPHABET = list(string.ascii_lowercase)\n\ndef f_778(word):\n \"\"\"\n Draws a bar chart representing the positions of each letter in the given word \n within the English alphabet using numpy and matplotlib.pyplot.\n \n Parameters:\n word (str): The word whose letters' positions will be plotted. \n Should contain only lowercase alphabetic characters.\n \n Returns:\n AxesSubplot: A matplotlib.axes._subplots.AxesSubplot object representing the generated plot.\n \n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Constants:\n - ALPHABET: A list containing all lowercase letters of the English alphabet.\n \n Examples:\n >>> ax = f_778('abc')\n >>> ax = f_778('hello')\n \n Note: \n The function uses the index of each letter in the English alphabet to represent its position.\n For example, 'a' will be represented by 1, 'b' by 2, and so on.\n \"\"\"", "canonical_solution": " # Validate the input word to contain only alphabetic characters\n if not all(char in ALPHABET for char in word):\n raise ValueError(\"The word should contain only lowercase alphabetic characters.\")\n \n # Calculate the positions of each letter in the word within the alphabet\n letter_positions = np.array(list(map(lambda x: ALPHABET.index(x) + 1, word)))\n \n # Create a figure and axis object\n fig, ax = plt.subplots()\n \n # Draw the bar chart on the axis\n ax.bar(np.arange(len(letter_positions)), letter_positions)\n \n # Configure plot settings\n ax.set_xlabel('Letter Index')\n ax.set_ylabel('Alphabetical Position')\n ax.set_title('Alphabetical Position of Letters in Word')\n \n plt.show()\n \n return ax", "test": "import unittest\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n 
\n def test_case_1(self):\n ax = f_778('abc')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 1, \"The height of the first bar should be 1.\")\n self.assertEqual(ax.patches[1].get_height(), 2, \"The height of the second bar should be 2.\")\n self.assertEqual(ax.patches[2].get_height(), 3, \"The height of the third bar should be 3.\")\n \n def test_case_2(self):\n ax = f_778('xyz')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 24, \"The height of the first bar should be 24.\")\n self.assertEqual(ax.patches[1].get_height(), 25, \"The height of the second bar should be 25.\")\n self.assertEqual(ax.patches[2].get_height(), 26, \"The height of the third bar should be 26.\")\n \n def test_case_3(self):\n ax = f_778('ace')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 1, \"The height of the first bar should be 1.\")\n self.assertEqual(ax.patches[1].get_height(), 3, \"The height of the second bar should be 3.\")\n self.assertEqual(ax.patches[2].get_height(), 5, \"The height of the third bar should be 5.\")\n \n def test_case_4(self):\n ax = f_778('bd')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 2, \"The height of the first bar should be 2.\")\n self.assertEqual(ax.patches[1].get_height(), 4, \"The height of the second bar should be 4.\")\n \n def test_case_5(self):\n with self.assertRaises(ValueError):\n f_778('a1b')", "apis": ["numpy.arange", "string.ascii_lowercase", "matplotlib.pyplot.show", "numpy.array", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "string"], "doc": {"description": ["Draws a bar chart representing the positions of each letter in the given word", "within the English alphabet using numpy 
and matplotlib.pyplot.", "Constants:", "- ALPHABET: A list containing all lowercase letters of the English alphabet."], "note": ["The function uses the index of each letter in the English alphabet to represent its position.", "For example, 'a' will be represented by 1, 'b' by 2, and so on."], "params": ["word (str): The word whose letters' positions will be plotted.", "Should contain only lowercase alphabetic characters."], "returns": ["AxesSubplot: A matplotlib.axes._subplots.AxesSubplot object representing the generated plot."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": ["Examples:", ">>> ax = f_778('abc')", ">>> ax = f_778('hello')"]}} +{"task_id": "f_354", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_354(data: pd.DataFrame) -> (pd.DataFrame, list):\n \"\"\"\n This function takes a pandas DataFrame and standardizes its features using sklearn's StandardScaler,\n which standardizes features by removing the mean and scaling to unit variance.\n After standardization, it draws a histogram for each feature with 20 bins.\n\n Parameters:\n - data (pd.DataFrame): The input data to be standardized and plotted. It is expected to have\n columns named 'Feature1', 'Feature2', 'Feature3', 'Feature4', and 'Feature5'.\n If there are additional data columns, they are ignored.\n\n\n Returns:\n - standardized_data (pd.DataFrame): The standardized data.\n - axes_list (list): A list of matplotlib Axes objects representing the histograms for each feature.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n - sklearn.preprocessing.StandardScaler\n \n Example:\n >>> data = pd.DataFrame({\n ... 'Feature1': [0.5, 0.6, 0.7, 0.8, 0.9],\n ... 'Feature2': [0.1, 0.2, 0.3, 0.4, 0.5],\n ... 'Feature3': [0.9, 0.8, 0.7, 0.6, 0.5],\n ... 'Feature4': [0.5, 0.4, 0.3, 0.2, 0.1],\n ... 'Feature5': [0.1, 0.3, 0.5, 0.7, 0.9]\n ... 
})\n >>> standardized_data, axes_list = f_354(data)\n >>> type(standardized_data)\n \n >>> axes_list\n [, , , , ]\n >>> type(axes_list[0])\n \n \"\"\"", "canonical_solution": " FEATURES = [\"Feature1\", \"Feature2\", \"Feature3\", \"Feature4\", \"Feature5\"]\n\n scaler = StandardScaler()\n data_standardized = pd.DataFrame(\n scaler.fit_transform(data[FEATURES]), columns=FEATURES\n )\n\n axes_list = []\n for feature in FEATURES:\n fig, ax = plt.subplots()\n ax.hist(data_standardized[feature], bins=20, alpha=0.5)\n ax.set_title(\"Histogram of {}\".format(feature))\n axes_list.append(ax)\n\n return data_standardized, axes_list", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.columns = [\"Feature1\", \"Feature2\", \"Feature3\", \"Feature4\", \"Feature5\"]\n np.random.seed(0)\n def test_case_1(self):\n # Test basic case\n data = pd.DataFrame(\n np.random.rand(100, 5),\n columns=self.columns,\n )\n self.standardized_data_test(data)\n def test_case_2(self):\n # Test standardizing different distribution\n data = pd.DataFrame(\n np.random.exponential(scale=1.0, size=(100, 5)),\n columns=self.columns,\n )\n self.standardized_data_test(data)\n def test_case_3(self):\n # Test standardizing data combined from different distributions\n data_1 = np.random.rand(100, 3)\n data_2 = np.random.exponential(scale=1.0, size=(100, 2))\n data = pd.DataFrame(\n np.hstack((data_1, data_2)),\n columns=self.columns,\n )\n self.standardized_data_test(data)\n def test_case_4(self):\n # Test the function with highly skewed data\n data = pd.DataFrame(\n np.random.chisquare(df=1, size=(100, 5)),\n columns=self.columns,\n )\n standardized_data, _ = f_354(data)\n self.assertTrue(np.isclose(standardized_data.std().values, 1, atol=1e-1).all())\n def test_case_5(self):\n # Test function with a dataframe that has only one row\n data = pd.DataFrame(\n {\n \"Feature1\": [0.1],\n \"Feature2\": 
[0.2],\n \"Feature3\": [0.3],\n \"Feature4\": [0.4],\n \"Feature5\": [0.5],\n }\n )\n _, axes_list = f_354(data)\n self.assertEqual(len(axes_list), 5)\n def test_case_6(self):\n # Test with columns having identical values across all rows.\n data = pd.DataFrame(\n {\n \"Feature1\": [0.1] * 100,\n \"Feature2\": [0.2] * 100,\n \"Feature3\": [0.3] * 100,\n \"Feature4\": [0.4] * 100,\n \"Feature5\": [0.5] * 100,\n }\n )\n standardized_data, _ = f_354(data)\n # Identical values become NaN after standardization because variance is 0\n expected_zeros = pd.DataFrame(\n 0,\n index=np.arange(100),\n columns=self.columns,\n )\n self.assertTrue(np.isclose(standardized_data, expected_zeros).all().all())\n def test_case_7(self):\n # Test with additional columns not in the expected FEATURES set\n data = pd.DataFrame(\n np.random.rand(100, 7),\n columns=self.columns\n + [\n \"Extra1\",\n \"Extra2\",\n ],\n )\n _, axes_list = f_354(data)\n self.assertEqual(len(axes_list), 5)\n def test_case_8(self):\n # Test with missing columns from the expected FEATURES set\n data = pd.DataFrame(\n np.random.rand(100, 3), columns=[\"Feature1\", \"Feature2\", \"Feature3\"]\n )\n with self.assertRaises(KeyError):\n f_354(data)\n def test_case_9(self):\n # Test should fail when there is invalid input - empty dataframe\n data = pd.DataFrame()\n with self.assertRaises(KeyError):\n f_354(data)\n def test_case_10(self):\n # Test should fail when there is invalid input - NaN\n data = pd.DataFrame(\n {\n \"Feature1\": [np.nan, 0.2, 0.3],\n \"Feature2\": [0.1, np.nan, 0.3],\n \"Feature3\": [0.2, 0.2, np.nan],\n \"Feature4\": [np.nan, 0.4, 0.5],\n \"Feature5\": [0.5, 0.6, np.nan],\n }\n )\n standardized_data, _ = f_354(data)\n self.assertTrue(standardized_data.isnull().any().any())\n def test_case_11(self):\n # Test should fail when there is invalid input - inf\n data = pd.DataFrame(\n {\n \"Feature1\": [np.inf, 0.2, 0.3],\n \"Feature2\": [0.1, -np.inf, 0.3],\n \"Feature3\": [0.2, 0.2, np.inf],\n 
\"Feature4\": [-np.inf, 0.4, 0.5],\n \"Feature5\": [0.5, 0.6, -np.inf],\n }\n )\n with self.assertRaises(ValueError):\n f_354(data)\n def test_case_12(self):\n # Test the function with non-numeric columns.\n data = pd.DataFrame(\n {\n \"Feature1\": [\"a\", \"b\", \"c\"],\n \"Feature2\": [\"d\", \"e\", \"f\"],\n \"Feature3\": [\"g\", \"h\", \"i\"],\n \"Feature4\": [\"j\", \"k\", \"l\"],\n \"Feature5\": [\"m\", \"n\", \"o\"],\n }\n )\n with self.assertRaises(ValueError):\n f_354(data)\n def test_case_13(self):\n # Function should fail if more than expected number of features (5)\n data = pd.DataFrame(np.random.rand(100, 50))\n with self.assertRaises(KeyError):\n f_354(data)\n def standardized_data_test(self, data):\n np.random.seed(0)\n standardized_data, axes_list = f_354(data)\n # Check if the data is standardized (mean ~ 0 and standard deviation ~ 1)\n self.assertTrue(np.isclose(standardized_data.mean().values, 0, atol=1e-2).all())\n self.assertTrue(np.isclose(standardized_data.std().values, 1, atol=1e-1).all())\n # Check the number of returned histograms\n self.assertEqual(len(axes_list), 5)\n # Check if each histogram is correctly titled\n for ax, feature in zip(axes_list, self.columns):\n self.assertEqual(ax.get_title(), f\"Histogram of {feature}\")\n # Check if histograms have the right number of bins\n for ax in axes_list:\n self.assertEqual(len(ax.patches), 20)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "sklearn.preprocessing.StandardScaler", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["This function takes a pandas DataFrame and standardizes its features using sklearn's StandardScaler,", "which standardizes features by removing the mean and scaling to unit variance.", "After standardization, it draws a histogram for each feature with 20 bins."], "note": [], "params": ["data (pd.DataFrame): The input data to be standardized and plotted. 
It is expected to have", "columns named 'Feature1', 'Feature2', 'Feature3', 'Feature4', and 'Feature5'.", "If there are additional data columns, they are ignored."], "returns": ["standardized_data (pd.DataFrame): The standardized data.", "axes_list (list): A list of matplotlib Axes objects representing the histograms for each feature."], "reqs": ["pandas", "matplotlib.pyplot", "sklearn.preprocessing.StandardScaler"], "raises": [], "example": [">>> data = pd.DataFrame({", "... 'Feature1': [0.5, 0.6, 0.7, 0.8, 0.9],", "... 'Feature2': [0.1, 0.2, 0.3, 0.4, 0.5],", "... 'Feature3': [0.9, 0.8, 0.7, 0.6, 0.5],", "... 'Feature4': [0.5, 0.4, 0.3, 0.2, 0.1],", "... 'Feature5': [0.1, 0.3, 0.5, 0.7, 0.9]", "... })", ">>> standardized_data, axes_list = f_354(data)", ">>> type(standardized_data)", "", ">>> axes_list", "[, , , , ]", ">>> type(axes_list[0])", ""]}} +{"task_id": "f_757", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import zscore\n\ndef f_757(df, z_threshold=2):\n \"\"\"\n Identifies and plots outliers in the 'closing_price' column of a given DataFrame using the Z-Score method.\n \n Parameters:\n df (pandas.DataFrame): The input DataFrame that must contain a column named 'closing_price' with numerical values.\n z_threshold (float, optional): The absolute Z-Score threshold for identifying outliers. Default is 2.\n \n Returns:\n tuple: A tuple containing the following elements:\n - pandas.DataFrame: A DataFrame containing the outliers in the 'closing_price' column.\n - matplotlib.axes._subplots.Axes: The plot object displaying the outliers.\n \n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - scipy.stats.zscore\n \n Constants:\n - Z-Score threshold for identifying outliers is customizable via the 'z_threshold' parameter.\n \n Examples:\n >>> df1 = pd.DataFrame({\n ... 'closing_price': [100, 101, 102, 103, 104, 150]\n ... })\n >>> outliers1, plot1 = f_757(df1)\n \n >>> df2 = pd.DataFrame({\n ... 
'closing_price': [10, 20, 30, 40, 50, 100]\n ... })\n >>> outliers2, plot2 = f_757(df2, z_threshold=1.5)\n \"\"\"", "canonical_solution": " # Calculate Z-Scores for the 'closing_price' column\n df['Z_score'] = zscore(df['closing_price'])\n \n # Identify outliers based on Z-Score threshold\n outliers = df[np.abs(df['Z_score']) > z_threshold]\n \n # Create the plot\n fig, ax = plt.subplots(figsize=(10, 5))\n ax.plot(df['closing_price'], color='blue', label='Normal')\n ax.plot(outliers['closing_price'], linestyle='none', marker='X', color='red', markersize=12, label='Outlier')\n ax.set_xlabel('Index')\n ax.set_ylabel('Closing Price')\n ax.set_title('Outliers in Closing Prices')\n ax.legend(loc='best')\n \n return outliers, ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n df1 = pd.DataFrame({\n 'closing_price': [100, 101, 102, 103, 104, 150]\n })\n outliers1, plot1 = f_757(df1)\n self.assertEqual(outliers1['closing_price'].tolist(), [150])\n self.assertEqual(plot1.get_title(), 'Outliers in Closing Prices')\n self.assertEqual(plot1.get_xlabel(), 'Index')\n self.assertEqual(plot1.get_ylabel(), 'Closing Price')\n \n def test_case_2(self):\n df2 = pd.DataFrame({\n 'closing_price': [10, 20, 30, 40, 50, 100]\n })\n outliers2, plot2 = f_757(df2, z_threshold=1.5)\n self.assertEqual(outliers2['closing_price'].tolist(), [100])\n self.assertEqual(outliers2['Z_score'].tolist(), [2.004094170098539])\n \n def test_case_3(self):\n df3 = pd.DataFrame({\n 'closing_price': [112,23,23,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]\n })\n outliers3, plot3 = f_757(df3, z_threshold=3)\n self.assertEqual(outliers3['closing_price'].tolist(), [112])\n self.assertEqual(outliers3['Z_score'].tolist(), [4.309576782241563])\n def test_case_4(self):\n df3 = pd.DataFrame({\n 'closing_price': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 112]\n })\n outliers3, plot3 = f_757(df3, z_threshold=-1)\n 
self.assertEqual(outliers3['closing_price'].tolist(), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 112])\n self.assertEqual(outliers3['Z_score'].tolist(), [-0.46136484230149855, -0.42883270598536727, -0.39630056966923594, -0.36376843335310466, -0.3312362970369733, -0.29870416072084205, -0.2661720244047107, -0.2336398880885794, -0.2011077517724481, -0.16857561545631677, 3.1497022887890767])\n \n def test_case_5(self):\n df3 = pd.DataFrame({\n 'closing_price': []\n })\n outliers3, plot3 = f_757(df3, z_threshold=0)\n self.assertEqual(outliers3['closing_price'].tolist(), [])\n self.assertEqual(outliers3['Z_score'].tolist(), [])", "apis": ["matplotlib.pyplot.subplots", "numpy.abs", "scipy.stats.zscore"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Identifies and plots outliers in the 'closing_price' column of a given DataFrame using the Z-Score method.", "Constants:", "- Z-Score threshold for identifying outliers is customizable via the 'z_threshold' parameter.", ">>> df2 = pd.DataFrame({", "... 'closing_price': [10, 20, 30, 40, 50, 100]", "... })", ">>> outliers2, plot2 = f_757(df2, z_threshold=1.5)"], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame that must contain a column named 'closing_price' with numerical values.", "z_threshold (float, optional): The absolute Z-Score threshold for identifying outliers. Default is 2."], "returns": ["tuple: A tuple containing the following elements:", "pandas.DataFrame: A DataFrame containing the outliers in the 'closing_price' column.", "matplotlib.axes._subplots.Axes: The plot object displaying the outliers."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "scipy.stats.zscore"], "raises": [], "example": ["Examples:", ">>> df1 = pd.DataFrame({", "... 'closing_price': [100, 101, 102, 103, 104, 150]", "... 
})", ">>> outliers1, plot1 = f_757(df1)"]}} +{"task_id": "f_418", "prompt": "import pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_418(df: pd.DataFrame) -> (Counter, plt.Axes):\n \"\"\"\n Identify duplicate entries in a DataFrame and record the age distribution for the duplicate names.\n\n This function takes a DataFrame with 'name' and 'age' columns. If age is provided as floats,\n they will be rounded down to the nearest integer. Age must not be negative, otherwise the function\n raises ValueError. Then, the function identifies duplicate names and records the age distribution.\n It returns a Counter object with the age distribution and a histogram plot showing the distribution\n of ages for duplicate names, with age on the x-axis and count on the y-axis. Bins are calculated\n based on the minimum and maximum ages found among the duplicates, adjusted by .5 to ensure that\n integer ages fall squarely within bins.\n\n Parameters:\n df: pd.DataFrame - A DataFrame with columns 'name' and 'age'.\n Must not be empty. 
If empty, the function raises ValueError.\n\n Returns:\n Counter: Age distribution among duplicate names.\n plt.Axes or None: Histogram plot displaying age distribution, or None if there are no duplicates.\n\n Requirements:\n - pandas\n - numpy\n - collections.Counter\n - seaborn\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice'], 'age': [25, 26, 25]})\n >>> duplicates_counter, ax = f_418(df)\n >>> duplicates_counter\n Counter({25: 2})\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if df.empty:\n raise ValueError(\"Input data cannot be empty.\")\n if any(df[\"age\"] < 0):\n raise ValueError(\"Invalid age: age cannot be less than 0.\")\n\n df[\"age\"] = df[\"age\"].apply(np.floor).astype(int)\n\n duplicate_names = (\n df[\"name\"].value_counts()[df[\"name\"].value_counts() > 1].index.tolist()\n )\n duplicates_df = df[df[\"name\"].isin(duplicate_names)]\n duplicates_counter = Counter(duplicates_df[\"age\"])\n\n if duplicates_counter:\n min_age = duplicates_df[\"age\"].min() - 0.5\n max_age = duplicates_df[\"age\"].max() + 0.5\n bins = np.arange(min_age, max_age + 1)\n ax = sns.histplot(duplicates_df[\"age\"], bins=bins)\n plt.xlabel(\"Age\")\n plt.ylabel(\"Count\")\n plt.title(\"Distribution of Ages for Duplicate Names\")\n else:\n ax = None\n\n return duplicates_counter, ax", "test": "import unittest\nfrom collections import Counter\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Set up various test DataFrames for thorough testing\n self.df_valid = pd.DataFrame(\n {\"name\": [\"Alice\", \"Bob\", \"Alice\"], \"age\": [25, 26, 25]}\n )\n self.df_negative_age = pd.DataFrame(\n {\"name\": [\"Alice\", \"Bob\", \"Charlie\"], \"age\": [25, -1, 27]}\n )\n self.df_no_duplicates = pd.DataFrame(\n {\"name\": [\"Alice\", \"Bob\", \"Charlie\"], \"age\": [25, 26, 27]}\n )\n self.df_all_duplicates = pd.DataFrame(\n {\"name\": [\"Alice\", \"Alice\", \"Alice\"], 
\"age\": [25, 25, 25]}\n )\n self.df_mixed = pd.DataFrame(\n {\n \"name\": [\"Alice\", \"Bob\", \"Alice\", \"Bob\", \"Charlie\"],\n \"age\": [25, 26, 25, 27, 26],\n }\n )\n self.df_floats = pd.DataFrame(\n {\n \"name\": [\"Alice\", \"Bob\", \"Alice\", \"Bob\", \"Charlie\"],\n \"age\": [25.2, 26.1, 25.3, 27.5, 26.8],\n }\n )\n self.df_empty = pd.DataFrame({\"name\": [], \"age\": []})\n def _check_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.get_title())\n self.assertEqual(ax.get_xlabel(), \"Age\")\n self.assertEqual(ax.get_ylabel(), \"Count\")\n def test_case_1(self):\n # Test for a simple valid case with duplicates\n result, ax = f_418(self.df_valid)\n expected = Counter({25: 2})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_2(self):\n # Test for handling of negative ages\n with self.assertRaises(ValueError):\n f_418(self.df_negative_age)\n def test_case_3(self):\n # Test for no duplicates\n result, ax = f_418(self.df_no_duplicates)\n expected = Counter()\n self.assertEqual(result, expected)\n self.assertIsNone(ax)\n def test_case_4(self):\n # Test for all entries being duplicates\n result, ax = f_418(self.df_all_duplicates)\n expected = Counter({25: 3})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_5(self):\n # Test for a mix of duplicates and unique names\n result, ax = f_418(self.df_mixed)\n expected = Counter({25: 2, 26: 1, 27: 1})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_6(self):\n # Test for floats\n result, ax = f_418(self.df_floats)\n expected = Counter({25: 2, 26: 1, 27: 1})\n self.assertEqual(result, expected)\n self._check_plot(ax)\n def test_case_7(self):\n # Test for an empty DataFrame\n with self.assertRaises(ValueError):\n f_418(self.df_empty)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "pandas.DataFrame", "collections.Counter", "numpy.floor", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", 
"matplotlib.pyplot.Axes", "seaborn.histplot", "matplotlib.pyplot.xlabel"], "libs": ["collections", "numpy", "seaborn", "pandas", "matplotlib"], "doc": {"description": ["Identify duplicate entries in a DataFrame and record the age distribution for the duplicate names.", "This function takes a DataFrame with 'name' and 'age' columns. If age is provided as floats,", "they will be rounded down to the nearest integer. Age must not be negative, otherwise the function", "raises ValueError. Then, the function identifies duplicate names and records the age distribution.", "It returns a Counter object with the age distribution and a histogram plot showing the distribution", "of ages for duplicate names, with age on the x-axis and count on the y-axis. Bins are calculated", "based on the minimum and maximum ages found among the duplicates, adjusted by .5 to ensure that", "integer ages fall squarely within bins."], "note": [], "params": ["df: pd.DataFrame - A DataFrame with columns 'name' and 'age'.", "Must not be empty. If empty, the function raises ValueError."], "returns": ["Counter: Age distribution among duplicate names.", "plt.Axes or None: Histogram plot displaying age distribution, or None if there are no duplicates."], "reqs": ["pandas", "numpy", "collections.Counter", "seaborn", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice'], 'age': [25, 26, 25]})", ">>> duplicates_counter, ax = f_418(df)", ">>> duplicates_counter", "Counter({25: 2})", ">>> type(ax)", ""]}} +{"task_id": "f_401", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_401(column, data):\n \"\"\"\n Analyze a list of fitness data, calculate the sum, the mean, the minimum,\n the maximum of a certain column and draw a line chart. 
Additionally, validate\n that the numeric values for steps, calories burned, and distance walked are\n non-negative.\n\n Parameters:\n column (str): The column to analyze from the data. The allowed columns are:\n 'Date', 'Steps', 'Calories Burned', 'Distance Walked'.\n data (list of list): A list where each inner list contains a datetime object\n representing the date, followed by numeric values for steps,\n calories burned, and distance walked in that order. Each\n numeric value must be non-negative. Must not be empty.\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the sum, mean, min, max of the column.\n - matplotlib.axes.Axes: The Axes object of the plotted line chart. The line\n chart will have Date on its x-axis, the column value\n on its y-axis, and title Line Chart of (column).\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> data = [[datetime(2022, 1, 1), 5000, 200, 3.5],\n ... [datetime(2022, 1, 2), 5500, 220, 4.0],\n ... [datetime(2022, 1, 3), 6000, 240, 4.5]]\n >>> stats, ax = f_401('Steps', data)\n >>> type(ax)\n \n >>> print(stats)\n {'sum': 16500, 'mean': 5500.0, 'min': 5000, 'max': 6000}\n \"\"\"", "canonical_solution": " COLUMNS = [\"Date\", \"Steps\", \"Calories Burned\", \"Distance Walked\"]\n if column not in COLUMNS:\n raise KeyError(f\"{column} is not a valid column. 
Choose from {COLUMNS}.\")\n\n if not data:\n raise ValueError(\"No data to plot.\")\n df = pd.DataFrame(data, columns=COLUMNS)\n if df[[\"Steps\", \"Calories Burned\", \"Distance Walked\"]].lt(0).any().any():\n raise ValueError(\n \"Numeric values for steps, calories burned, and distance walked must be non-negative.\"\n )\n\n column_data = df[column]\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.mean(column_data),\n \"min\": np.min(column_data),\n \"max\": np.max(column_data),\n }\n\n ax = df.plot.line(x=\"Date\", y=column)\n ax.set_ylabel(column)\n plt.title(f\"Line Chart of {column}\")\n\n return result, ax", "test": "import unittest\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n data = [\n [datetime(2022, 1, 1), 5000, 200, 3.5],\n [datetime(2022, 1, 2), 5500, 220, 4.0],\n [datetime(2022, 1, 3), 6000, 240, 4.5],\n ]\n stats, ax = f_401(\"Steps\", data)\n self.assertEqual(\n stats, {\"sum\": 16500, \"mean\": 5500.0, \"min\": 5000, \"max\": 6000}\n )\n self.assertEqual(ax.get_title(), \"Line Chart of Steps\")\n def test_case_2(self):\n data = [\n [datetime(2022, 1, 1), 5000, 250, 3.5],\n [datetime(2022, 1, 2), 5500, 275, 4.0],\n [datetime(2022, 1, 3), 6000, 300, 4.5],\n ]\n stats, ax = f_401(\"Calories Burned\", data)\n self.assertEqual(stats, {\"sum\": 825, \"mean\": 275.0, \"min\": 250, \"max\": 300})\n self.assertEqual(ax.get_title(), \"Line Chart of Calories Burned\")\n def test_case_3(self):\n data = [\n [datetime(2022, 1, i), 5000 + i * 100, 250 + i * 10, 3.5 + i * 0.1]\n for i in range(1, 11)\n ]\n stats, ax = f_401(\"Distance Walked\", data)\n self.assertEqual(stats, {\"sum\": 40.5, \"mean\": 4.05, \"min\": 3.6, \"max\": 4.5})\n self.assertEqual(ax.get_title(), \"Line Chart of Distance Walked\")\n def test_case_4(self):\n # Test handling zeros\n data = [\n [datetime(2022, 1, 1), 0, 0, 0],\n [datetime(2022, 1, 2), 0, 0, 0],\n [datetime(2022, 1, 3), 0, 0, 0],\n ]\n 
stats, ax = f_401(\"Steps\", data)\n self.assertEqual(stats, {\"sum\": 0, \"mean\": 0.0, \"min\": 0, \"max\": 0})\n self.assertEqual(ax.get_title(), \"Line Chart of Steps\")\n def test_case_5(self):\n # Test larger values\n data = [\n [datetime(2022, 1, 1), 100000, 10000, 1000],\n [datetime(2022, 1, 2), 100000, 10000, 1000],\n [datetime(2022, 1, 3), 100000, 10000, 1000],\n ]\n stats, ax = f_401(\"Calories Burned\", data)\n self.assertEqual(\n stats, {\"sum\": 30000, \"mean\": 10000.0, \"min\": 10000, \"max\": 10000}\n )\n self.assertEqual(ax.get_title(), \"Line Chart of Calories Burned\")\n def test_case_6(self):\n # Test invalid column names\n data = [[datetime(2022, 1, 1), 5000, 200, 3.5]]\n with self.assertRaises(Exception):\n f_401(\"Invalid Column\", data)\n def test_case_7(self):\n # Test negative values\n data = [[datetime(2022, 1, 1), -5000, 200, 3.5]]\n with self.assertRaises(ValueError):\n f_401(\"Steps\", data)\n def test_case_8(self):\n # Test single row\n data = [[datetime(2022, 1, 1), 5000, 200, 3.5]]\n stats, _ = f_401(\"Steps\", data)\n self.assertEqual(stats, {\"sum\": 5000, \"mean\": 5000.0, \"min\": 5000, \"max\": 5000})\n def test_case_9(self):\n # Test non-sequential dates\n data = [\n [datetime(2022, 1, 3), 6000, 240, 4.5],\n [datetime(2022, 1, 1), 5000, 200, 3.5],\n [datetime(2022, 1, 2), 5500, 220, 4.0],\n ]\n stats, _ = f_401(\"Steps\", data)\n # Check data order doesn't affect calculation\n expected_stats = {\"sum\": 16500, \"mean\": 5500.0, \"min\": 5000, \"max\": 6000}\n self.assertEqual(stats, expected_stats)\n def test_case_10(self):\n # Test empty data\n data = []\n with self.assertRaises(Exception):\n f_401(\"Steps\", data)\n def test_case_11(self):\n # Test to ensure plot title and axis labels are correctly set\n data = [\n [datetime(2022, 1, 1), 5000, 200, 3.5],\n [datetime(2022, 1, 2), 5500, 220, 4.0],\n [datetime(2022, 1, 3), 6000, 240, 4.5],\n ]\n _, ax = f_401(\"Steps\", data)\n self.assertEqual(ax.get_title(), \"Line Chart of 
Steps\")\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel(), \"Steps\")\n def test_case_12(self):\n # Test to verify if the correct data points are plotted\n data = [\n [datetime(2022, 1, 1), 100, 50, 1.0],\n [datetime(2022, 1, 2), 200, 100, 2.0],\n ]\n _, ax = f_401(\"Distance Walked\", data)\n lines = ax.get_lines()\n _, y_data = lines[0].get_data()\n expected_y = np.array([1.0, 2.0])\n np.testing.assert_array_equal(y_data, expected_y)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.sum", "pandas.DataFrame", "numpy.mean", "numpy.min", "matplotlib.pyplot.title", "numpy.max"], "libs": ["pandas", "numpy", "matplotlib"], "doc": {"description": ["Analyze a list of fitness data, calculate the sum, the mean, the minimum,", "the maximum of a certain column and draw a line chart. Additionally, validate", "that the numeric values for steps, calories burned, and distance walked are", "non-negative."], "note": [], "params": ["column (str): The column to analyze from the data. The allowed columns are:", "'Date', 'Steps', 'Calories Burned', 'Distance Walked'.", "data (list of list): A list where each inner list contains a datetime object", "representing the date, followed by numeric values for steps,", "calories burned, and distance walked in that order. Each", "numeric value must be non-negative. Must not be empty."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the sum, mean, min, max of the column.", "matplotlib.axes.Axes: The Axes object of the plotted line chart. The line", "chart will have Date on its x-axis, the column value", "on its y-axis, and title Line Chart of (column)."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [[datetime(2022, 1, 1), 5000, 200, 3.5],", "... [datetime(2022, 1, 2), 5500, 220, 4.0],", "... 
[datetime(2022, 1, 3), 6000, 240, 4.5]]", ">>> stats, ax = f_401('Steps', data)", ">>> type(ax)", "", ">>> print(stats)", "{'sum': 16500, 'mean': 5500.0, 'min': 5000, 'max': 6000}"]}} +{"task_id": "f_778", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport string\n\n# Constants\nALPHABET = list(string.ascii_lowercase)\n\ndef f_778(word):\n \"\"\"\n Draws a bar chart representing the positions of each letter in the given word \n within the English alphabet using numpy and matplotlib.pyplot.\n \n Parameters:\n word (str): The word whose letters' positions will be plotted. \n Should contain only lowercase alphabetic characters.\n \n Returns:\n Axes: A matplotlib.axes._subplots.Axes object representing the generated plot.\n \n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Constants:\n - ALPHABET: A list containing all lowercase letters of the English alphabet.\n \n Examples:\n >>> ax = f_778('abc')\n >>> ax = f_778('hello')\n \n Note: \n The function uses the index of each letter in the English alphabet to represent its position.\n For example, 'a' will be represented by 1, 'b' by 2, and so on.\n \"\"\"", "canonical_solution": " # Validate the input word to contain only alphabetic characters\n if not all(char in ALPHABET for char in word):\n raise ValueError(\"The word should contain only lowercase alphabetic characters.\")\n \n # Calculate the positions of each letter in the word within the alphabet\n letter_positions = np.array(list(map(lambda x: ALPHABET.index(x) + 1, word)))\n \n # Create a figure and axis object\n fig, ax = plt.subplots()\n \n # Draw the bar chart on the axis\n ax.bar(np.arange(len(letter_positions)), letter_positions)\n \n # Configure plot settings\n ax.set_xlabel('Letter Index')\n ax.set_ylabel('Alphabetical Position')\n ax.set_title('Alphabetical Position of Letters in Word')\n \n plt.show()\n \n return ax", "test": "import unittest\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \n def 
test_case_1(self):\n ax = f_778('abc')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 1, \"The height of the first bar should be 1.\")\n self.assertEqual(ax.patches[1].get_height(), 2, \"The height of the second bar should be 2.\")\n self.assertEqual(ax.patches[2].get_height(), 3, \"The height of the third bar should be 3.\")\n \n def test_case_2(self):\n ax = f_778('xyz')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 24, \"The height of the first bar should be 24.\")\n self.assertEqual(ax.patches[1].get_height(), 25, \"The height of the second bar should be 25.\")\n self.assertEqual(ax.patches[2].get_height(), 26, \"The height of the third bar should be 26.\")\n \n def test_case_3(self):\n ax = f_778('ace')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 1, \"The height of the first bar should be 1.\")\n self.assertEqual(ax.patches[1].get_height(), 3, \"The height of the second bar should be 3.\")\n self.assertEqual(ax.patches[2].get_height(), 5, \"The height of the third bar should be 5.\")\n \n def test_case_4(self):\n ax = f_778('bd')\n self.assertIsInstance(ax, Axes, \"The returned object is not an instance of Axes.\")\n self.assertEqual(ax.patches[0].get_height(), 2, \"The height of the first bar should be 2.\")\n self.assertEqual(ax.patches[1].get_height(), 4, \"The height of the second bar should be 4.\")\n \n def test_case_5(self):\n with self.assertRaises(ValueError):\n f_778('a1b')", "apis": ["matplotlib.pyplot.show", "numpy.arange", "numpy.array", "string.ascii_lowercase", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "string"], "doc": {"description": ["Draws a bar chart representing the positions of each letter in the given word", "within the English alphabet using numpy and 
matplotlib.pyplot.", "Constants:", "- ALPHABET: A list containing all lowercase letters of the English alphabet."], "note": ["The function uses the index of each letter in the English alphabet to represent its position.", "For example, 'a' will be represented by 1, 'b' by 2, and so on."], "params": ["word (str): The word whose letters' positions will be plotted.", "Should contain only lowercase alphabetic characters."], "returns": ["Axes: A matplotlib.axes._subplots.Axes object representing the generated plot."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": ["Examples:", ">>> ax = f_778('abc')", ">>> ax = f_778('hello')"]}} {"task_id": "f_764", "prompt": "import pandas as pd\nimport re\n\n# Constants\nSTOPWORDS = set([\n \"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\",\n \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\", \"itself\",\n \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\",\n \"these\", \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\",\n \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\", \"because\",\n \"as\", \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"about\", \"against\", \"between\", \"into\",\n \"through\", \"during\", \"before\", \"after\", \"above\", \"below\", \"to\", \"from\", \"up\", \"down\", \"in\", \"out\",\n \"on\", \"off\", \"over\", \"under\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", \"where\",\n \"why\", \"how\", \"all\", \"any\", \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\",\n \"nor\", \"not\", \"only\", \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\",\n 
\"don\", \"should\", \"now\"\n])\n\ndef f_764(df, column):\n \"\"\"\n Removes English stopwords from a text column in a DataFrame and returns the modified DataFrame.\n \n Parameters:\n df (pandas.DataFrame): The DataFrame containing the text column to be processed.\n column (str): The name of the text column from which stopwords should be removed.\n \n Returns:\n pandas.DataFrame: A DataFrame with the stopwords removed from the specified column.\n \n Requirements:\n - pandas\n - re\n \n Constants:\n - STOPWORDS: A set containing common English stopwords.\n \n Example:\n >>> df = pd.DataFrame({'text': ['This is a sample sentence.', 'Another example here.']})\n >>> print(f_764(df, 'text'))\n text\n 0 sample sentence\n 1 Another example\n \"\"\"", "canonical_solution": " df[column] = df[column].apply(lambda x: ' '.join([word for word in re.findall(r'\\b\\w+\\b', x) if word.lower() not in STOPWORDS]))\n return df", "test": "import unittest\nimport pandas as pd\n# Import the refined function\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'text': ['This is a sample sentence.', 'Another example here.']})\n expected_df = pd.DataFrame({'text': ['sample sentence', 'Another example']})\n result_df = f_764(df, 'text')\n pd.testing.assert_frame_equal(result_df, expected_df)\n def test_case_2(self):\n df = pd.DataFrame({'content': ['Stopwords should be removed.', 'Testing this function.']})\n expected_df = pd.DataFrame({'content': ['Stopwords removed', 'Testing function']})\n result_df = f_764(df, 'content')\n pd.testing.assert_frame_equal(result_df, expected_df)\n def test_case_3(self):\n df = pd.DataFrame({'sentence': ['Hello world!', 'Good morning.']})\n expected_df = pd.DataFrame({'sentence': ['Hello world', 'Good morning']})\n result_df = f_764(df, 'sentence')\n pd.testing.assert_frame_equal(result_df, expected_df)\n def test_case_4(self):\n df = pd.DataFrame({'text': ['This is a single sentence.'] * 100})\n expected_df = 
pd.DataFrame({'text': ['single sentence'] * 100})\n result_df = f_764(df, 'text')\n pd.testing.assert_frame_equal(result_df, expected_df)\n def test_case_5(self):\n df = pd.DataFrame({'line': [''] * 50})\n expected_df = pd.DataFrame({'line': [''] * 50})\n result_df = f_764(df, 'line')\n pd.testing.assert_frame_equal(result_df, expected_df)", "apis": ["re.findall"], "libs": ["re"], "doc": {"description": ["Removes English stopwords from a text column in a DataFrame and returns the modified DataFrame.", "Constants:", "- STOPWORDS: A set containing common English stopwords."], "note": [], "params": ["df (pandas.DataFrame): The DataFrame containing the text column to be processed.", "column (str): The name of the text column from which stopwords should be removed."], "returns": ["pandas.DataFrame: A DataFrame with the stopwords removed from the specified column."], "reqs": ["pandas", "re"], "raises": [], "example": [">>> df = pd.DataFrame({'text': ['This is a sample sentence.', 'Another example here.']})", ">>> print(f_764(df, 'text'))", "text", "0 sample sentence", "1 Another example"]}} -{"task_id": "f_338", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom scipy.stats import chi2_contingency\n\n\ndef f_338(df1, df2, column1=\"feature1\", column2=\"feature2\"):\n \"\"\"\n Merge two dataframes based on the 'id' column, perform a chi-square independence test on the merged dataframe,\n and draw a heatmap of the contingency table created from the features in column1, column2.\n\n Parameters:\n - df1 (DataFrame): Left dataframe to merge. Must contain columns 'id' and one matching column1.\n - df2 (DataFrame): Right dataframe to merge from. Must contain columns 'id' and one matching column2.\n - column1 (str): Name of column containing features in df1. Defaults to 'feature1'.\n - column2 (str): Name of column containing features in df2. 
Defaults to 'feature2'.\n\n Returns:\n tuple: A tuple containing:\n - p (float): The p-value of the Chi-Squared test.\n - heatmap (plt.Axes): Seaborn heatmap of the contingency table.\n\n Requirements:\n - pandas\n - seaborn\n - scipy.stats.chi2_contingency\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': ['A', 'B', 'A']})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': ['X', 'Y', 'X']})\n >>> p_value, heatmap = f_338(df1, df2)\n >>> p_value\n 0.6650055421020291\n >>> heatmap\n \n \"\"\"", "canonical_solution": " df = pd.merge(df1, df2, on=\"id\")\n contingency_table = pd.crosstab(df[column1], df[column2])\n heatmap = sns.heatmap(contingency_table)\n chi2, p, dof, expected = chi2_contingency(contingency_table)\n return p, heatmap", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing basic functionality with simple data\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [\"A\", \"B\", \"A\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [\"X\", \"Y\", \"X\"]})\n p_value, heatmap = f_338(df1, df2)\n # P-value should be between 0 and 1 inclusive\n self.assertTrue(0.0 <= p_value <= 1.0)\n self.assertEqual(len(heatmap.get_yticklabels()), 2) # A and B\n self.assertEqual(len(heatmap.get_xticklabels()), 2) # X and Y\n def test_case_2(self):\n # Testing with distinct feature values across both dataframes\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [\"C\", \"D\", \"C\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [\"W\", \"W\", \"Z\"]})\n p_value, heatmap = f_338(df1, df2)\n self.assertTrue(0.0 <= p_value <= 1.0)\n self.assertEqual(len(heatmap.get_yticklabels()), 2) # C and D\n self.assertEqual(len(heatmap.get_xticklabels()), 2) # W and Z\n def test_case_3(self):\n # Test custom feature column names\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"foo\": [\"A\", \"B\", \"A\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"bar\": [\"X\", \"Y\", 
\"X\"]})\n p_value, heatmap = f_338(df1, df2, column1=\"foo\", column2=\"bar\")\n self.assertTrue(0.0 <= p_value <= 1.0)\n self.assertEqual(len(heatmap.get_yticklabels()), 2)\n self.assertEqual(len(heatmap.get_xticklabels()), 2)\n def test_case_4(self):\n # Testing a scenario where the p-value is expected to be close to 0\n # This is because there's a strong association between feature1 and feature2\n df1 = pd.DataFrame(\n {\"id\": list(range(1, 21)), \"feature1\": [\"A\"] * 10 + [\"B\"] * 10}\n )\n df2 = pd.DataFrame(\n {\"id\": list(range(1, 21)), \"feature2\": [\"X\"] * 10 + [\"Y\"] * 10}\n )\n p_value, _ = f_338(df1, df2)\n self.assertTrue(0.0 <= p_value < 0.01) # Expected p-value to be close to 0\n def test_case_5(self):\n # Test error handling - should fail when there is no 'id' column\n df1 = pd.DataFrame({\"foo\": [1, 2], \"bar\": [3, 4]})\n df2 = pd.DataFrame({\"foo\": [1, 2], \"bar\": [3, 4]})\n with self.assertRaises(KeyError):\n f_338(df1, df2)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.crosstab", "seaborn.heatmap", "scipy.stats.chi2_contingency", "pandas.merge"], "libs": ["pandas", "seaborn", "scipy"], "doc": {"description": ["Merge two dataframes based on the 'id' column, perform a chi-square independence test on the merged dataframe,", "and draw a heatmap of the contingency table created from the features in column1, column2."], "note": [], "params": ["df1 (DataFrame): Left dataframe to merge. Must contain columns 'id' and one matching column1.", "df2 (DataFrame): Right dataframe to merge from. Must contain columns 'id' and one matching column2.", "column1 (str): Name of column containing features in df1. Defaults to 'feature1'.", "column2 (str): Name of column containing features in df2. 
Defaults to 'feature2'."], "returns": ["tuple: A tuple containing:", "p (float): The p-value of the Chi-Squared test.", "heatmap (plt.Axes): Seaborn heatmap of the contingency table."], "reqs": ["pandas", "seaborn", "scipy.stats.chi2_contingency"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': ['A', 'B', 'A']})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': ['X', 'Y', 'X']})", ">>> p_value, heatmap = f_338(df1, df2)", ">>> p_value", "0.6650055421020291", ">>> heatmap", ""]}} -{"task_id": "f_871", "prompt": "import subprocess\nimport time\nimport json\nimport platform\n\nLOGFILE_PATH = \"logfile.log\"\n\n\ndef f_871(interval, duration):\n \"\"\"\n Monitors and logs CPU usage at specified intervals over a given duration.\n\n Parameters:\n interval (int): The frequency, in seconds, at which CPU usage data is captured. Must be greater than zero.\n duration (int): The total duration, in seconds, for which CPU usage is monitored. Must be greater than zero.\n\n Returns:\n str: Path to the log file where CPU usage data is saved. 
Returns None if an IOError occurs during file operations.\n\n Raises:\n ValueError: If either 'interval' or 'duration' is less than or equal to zero.\n\n Requirements:\n - subprocess\n - time\n - json\n - platform\n\n Note: \n Actual run time of the function may slightly exceed the specified 'duration' due to processing time and system response delay.\n The function records the CPU usage percentage at regular intervals for a specified duration.\n The data is captured every 'interval' seconds until the 'duration' is reached or exceeded.\n Each record includes a timestamp and the CPU usage percentage at that moment.\n The data is saved in JSON format in a log file named 'logfile.log'.\n The function supports different commands for CPU usage monitoring on Windows and Unix/Linux platforms.\n \n Example:\n >>> f_871(5, 60)\n 'logfile.log'\n \"\"\"", "canonical_solution": " if interval <= 0 or duration <= 0:\n raise ValueError(\"Interval and duration must be greater than zero.\")\n\n start_time = time.time()\n try:\n with open(LOGFILE_PATH, \"w\", encoding=\"utf-8\") as logfile:\n while time.time() - start_time <= duration:\n operation_start_time = time.time()\n\n # Check the operating system\n if platform.system() == \"Windows\":\n # Windows command for CPU usage\n command = [\n \"typeperf\",\n \"\\\\Processor(_Total)\\\\% Processor Time\",\n \"-sc\",\n \"1\",\n ]\n else:\n # Unix/Linux command for CPU usage\n command = [\"top\", \"-b\", \"-n1\"]\n\n output = subprocess.check_output(command)\n cpu_usage_line = (\n output.decode(\"utf-8\").split(\"\\n\")[2]\n if platform.system() == \"Windows\"\n else output.decode(\"utf-8\").split(\"\\n\")[2]\n )\n cpu_usage = (\n cpu_usage_line.split(\",\")[-1].strip().replace('\"', \"\")\n if platform.system() == \"Windows\"\n else cpu_usage_line.split(\":\")[1].split(\",\")[0].strip()\n )\n\n log_data = {\"timestamp\": time.time(), \"cpu_usage\": cpu_usage}\n json.dump(log_data, logfile)\n logfile.write(\"\\n\")\n\n # Adjust sleep 
time\n sleep_time = max(0, interval - (time.time() - operation_start_time))\n time.sleep(sleep_time)\n except IOError as e:\n print(f\"Error writing to file {LOGFILE_PATH}: {e}\")\n return None\n\n return LOGFILE_PATH", "test": "import unittest\nimport os\nimport json\nfrom unittest.mock import patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_871.\"\"\"\n def setUp(self):\n \"\"\"\n Setup before each test case.\n \"\"\"\n self.logfile_path = \"logfile.log\"\n def tearDown(self):\n \"\"\"\n Cleanup after each test case.\n \"\"\"\n if os.path.exists(self.logfile_path):\n os.remove(self.logfile_path)\n @patch(\"time.time\")\n def test_normal_operation(self, mock_time):\n \"\"\"\n Test the normal operation of the function.\n It should create a log file with the expected content.\n \"\"\"\n # Create an iterator that starts at 0 and increments by 5 every time it's called\n time_iter = iter(range(0, 100, 5))\n mock_time.side_effect = lambda: next(time_iter)\n result = f_871(5, 25)\n self.assertEqual(result, self.logfile_path)\n self.assertTrue(os.path.exists(self.logfile_path))\n def test_invalid_interval(self):\n \"\"\"\n Test the function with an invalid interval value (less than or equal to zero).\n It should raise a ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_871(-1, 10)\n def test_invalid_duration(self):\n \"\"\"\n Test the function with an invalid duration value (less than or equal to zero).\n It should raise a ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_871(5, -10)\n @patch(\"subprocess.check_output\")\n @patch(\"time.time\")\n @patch(\"platform.system\")\n def test_subprocess_output_handling_windows(\n self, mock_platform, mock_time, mock_subprocess\n ):\n \"\"\"\n Test handling of subprocess output on Windows.\n It should correctly parse the CPU usage from the subprocess output.\n \"\"\"\n mock_platform.return_value = \"Windows\"\n mock_time.side_effect = iter(range(0, 100, 5))\n mock_output = 
b'\"\\\\Processor(_Total)\\\\% Processor Time\",\"5.0\"\\n\\n\"2023-04-01 12:34:56.789\",\"5.0\"\\n'\n mock_subprocess.return_value = mock_output\n result = f_871(5, 10)\n self.assertEqual(result, self.logfile_path)\n @patch(\"subprocess.check_output\")\n @patch(\"time.time\")\n @patch(\"platform.system\")\n def test_subprocess_output_handling_linux(\n self, mock_platform, mock_time, mock_subprocess\n ):\n \"\"\"\n Test handling of subprocess output on Linux.\n It should correctly parse the CPU usage from the subprocess output.\n \"\"\"\n mock_platform.return_value = \"Linux\"\n mock_time.side_effect = iter(range(0, 100, 5))\n mock_output = b\"Linux 4.15.0-54-generic (ubuntu) \\nTasks: 195 total...\\n%Cpu(s): 5.0 us, 2.0 sy, 0.0 ni, 92.0 id, 0.0 wa, 0.0 hi, 1.0 si, 0.0 st\\n\"\n mock_subprocess.return_value = mock_output\n result = f_871(5, 10)\n self.assertEqual(result, self.logfile_path)\n @patch(\"builtins.open\", side_effect=IOError(\"Mocked error\"))\n def test_io_error_handling(self, mock_open):\n \"\"\"\n Test the function's behavior when an IOError occurs during file operations.\n It should handle the error and return None.\n \"\"\"\n result = f_871(5, 10)\n self.assertIsNone(result)", "apis": ["subprocess.check_output", "platform.system", "time.time", "time.sleep", "json.dump"], "libs": ["subprocess", "json", "platform", "time"], "doc": {"description": ["Monitors and logs CPU usage at specified intervals over a given duration."], "note": ["Actual run time of the function may slightly exceed the specified 'duration' due to processing time and system response delay.", "The function records the CPU usage percentage at regular intervals for a specified duration.", "The data is captured every 'interval' seconds until the 'duration' is reached or exceeded.", "Each record includes a timestamp and the CPU usage percentage at that moment.", "The data is saved in JSON format in a log file named 'logfile.log'.", "The function supports different commands for CPU usage 
monitoring on Windows and Unix/Linux platforms."], "params": ["interval (int): The frequency, in seconds, at which CPU usage data is captured. Must be greater than zero.", "duration (int): The total duration, in seconds, for which CPU usage is monitored. Must be greater than zero."], "returns": ["str: Path to the log file where CPU usage data is saved. Returns None if an IOError occurs during file operations."], "reqs": ["subprocess", "time", "json", "platform"], "raises": ["ValueError: If either 'interval' or 'duration' is less than or equal to zero."], "example": [">>> f_871(5, 60)", "'logfile.log'"]}} -{"task_id": "f_390", "prompt": "from datetime import datetime\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_390(\n epoch_milliseconds,\n teams=[\"Team1\", \"Team2\", \"Team3\", \"Team4\", \"Team5\"],\n random_seed=0,\n):\n \"\"\"\n Generate and plot a performance trend for different teams from a given epoch timestamp to the current time.\n\n The performance data is generated by creating a series of random values for each day from the starting timestamp\n to the present day. Each team's performance is simulated as a random float between 0.1 and 1 for each day.\n The plot shows days since the start date on the x-axis and performance on the y-axis.\n\n Parameters:\n epoch_milliseconds (int): The epoch milliseconds from where to start the generation. Must not be in the future.\n teams (list of str, optional): Team names. If not provided, defaults to ['Team1', 'Team2', 'Team3', 'Team4', 'Team5'].\n random_seed (int, optional): Seed for random number generation to ensure reproducibility. 
Defaults to 0.\n\n Returns:\n dict: A dictionary containing performance data for each team, with days as indices and performance as float values.\n matplotlib.figure.Figure: A figure object showing the performance trend of each team over the days.\n\n Requirements:\n - datetime.datetime\n - random\n - matplotlib\n\n Example:\n >>> results, ax = f_390(1236472051807)\n >>> results.keys()\n dict_keys(['Team1', 'Team2', 'Team3', 'Team4', 'Team5'])\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n random.seed(random_seed)\n\n if (not isinstance(teams, list)) or (not all(isinstance(t, str) for t in teams)):\n raise TypeError(\"Expected teams to be list of str\")\n\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n current_time = datetime.now()\n days_diff = (current_time - start_time).days\n\n if days_diff < 0:\n raise ValueError(\"Input epoch timestamp is in the future!\")\n\n performance_data = {team: [0] * days_diff for team in teams}\n\n for i in range(days_diff):\n for team in teams:\n performance = random.uniform(0.1, 1)\n performance_data[team][i] += performance\n\n fig, ax = plt.subplots()\n for team, performance in performance_data.items():\n ax.plot(range(days_diff), performance, label=team)\n\n ax.set_xlabel(\"Days since \" + start_time.strftime(\"%Y-%m-%d %H:%M:%S\"))\n ax.set_ylabel(\"Performance\")\n ax.legend()\n\n return performance_data, fig", "test": "import unittest\nfrom datetime import datetime, timedelta\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.x = 1631295600000\n self.default_valid_teams = [\"Team1\", \"Team2\", \"Team3\", \"Team4\", \"Team5\"]\n def _check_valid_performance_data(self, performance_data, valid_teams):\n self.assertIsInstance(performance_data, dict)\n self.assertTrue(all(team in valid_teams for team in performance_data.keys()))\n for team, performances in performance_data.items():\n for performance in performances:\n self.assertTrue(\n 0.1 <= performance <= 
1, f\"Performance out of range for {team}\"\n )\n self.assertIsInstance(performance, float)\n def _check_plot(self, fig):\n ax = fig.axes[0]\n self.assertIsInstance(fig, plt.Figure)\n self.assertEqual(ax.get_ylabel(), \"Performance\")\n self.assertTrue(ax.get_xlabel().startswith(\"Days since\"))\n def test_case_1(self):\n # Test basic case with default parameters - data\n performance_data, _ = f_390(self.x)\n self._check_valid_performance_data(performance_data, self.default_valid_teams)\n def test_case_2(self):\n # Test basic case with default parameters - plot\n _, fig = f_390(self.x)\n self._check_plot(fig)\n def test_case_3(self):\n # Test basic case with custom input\n performance_data, fig = f_390(1236472051807, random_seed=42)\n self._check_plot(fig)\n self._check_valid_performance_data(performance_data, self.default_valid_teams)\n def test_case_4(self):\n # Test custom parameters - custom teams\n for custom_teams in [[\"A\", \"B\"], [\"c d e\", \"F\", \"GH\", \"ij kl\"]]:\n performance_data, fig = f_390(self.x, teams=custom_teams, random_seed=42)\n self._check_plot(fig)\n self._check_valid_performance_data(performance_data, custom_teams)\n def test_case_5(self):\n # Test custom parameters - random seed\n performance_data1, _ = f_390(self.x, random_seed=42)\n performance_data2, _ = f_390(self.x, random_seed=42)\n performance_data3, _ = f_390(self.x, random_seed=0)\n self.assertEqual(performance_data1, performance_data2)\n self.assertNotEqual(performance_data1, performance_data3)\n def test_case_6(self):\n # Test error handling for invalid input time\n future_epoch = int((datetime.now() + timedelta(days=1)).timestamp() * 1000)\n with self.assertRaises(ValueError):\n f_390(future_epoch)\n def test_case_7(self):\n # Test error handling for invalid team\n with self.assertRaises(TypeError):\n f_390(self.x, [1, 2, 3])\n with self.assertRaises(TypeError):\n f_390(self.x, [[]])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["random.seed", 
"datetime.datetime.now", "random.uniform", "datetime.datetime.fromtimestamp", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "random", "datetime"], "doc": {"description": ["Generate and plot a performance trend for different teams from a given epoch timestamp to the current time.", "The performance data is generated by creating a series of random values for each day from the starting timestamp", "to the present day. Each team's performance is simulated as a random float between 0.1 and 1 for each day.", "The plot shows days since the start date on the x-axis and performance on the y-axis."], "note": [], "params": ["epoch_milliseconds (int): The epoch milliseconds from where to start the generation. Must not be in the future.", "teams (list of str, optional): Team names. If not provided, defaults to ['Team1', 'Team2', 'Team3', 'Team4', 'Team5'].", "random_seed (int, optional): Seed for random number generation to ensure reproducibility. Defaults to 0."], "returns": ["dict: A dictionary containing performance data for each team, with days as indices and performance as float values.", "matplotlib.figure.Figure: A figure object showing the performance trend of each team over the days."], "reqs": ["datetime.datetime", "random", "matplotlib"], "raises": [], "example": [">>> results, ax = f_390(1236472051807)", ">>> results.keys()", "dict_keys(['Team1', 'Team2', 'Team3', 'Team4', 'Team5'])", ">>> type(ax)", ""]}} -{"task_id": "f_759", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib.axes import Axes\nfrom statsmodels.tsa.arima.model import ARIMA\nfrom typing import List, Tuple\n\ndef f_759(df: pd.DataFrame) -> Tuple[List[float], Axes]:\n \"\"\"\n Forecasts the share closing prices for the next 7 days using the ARIMA model and plots the forecast.\n\n Parameters:\n df (pd.DataFrame): The input dataframe with columns 'date' and 'closing_price'. 
\n 'date' should be of datetime dtype and 'closing_price' should be float.\n\n Returns:\n Tuple[List[float], Axes]: A tuple containing:\n - A list with forecasted prices for the next 7 days.\n - A matplotlib Axes object containing the plot.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - statsmodels.tsa.arima.model.ARIMA\n\n Example:\n >>> df = pd.DataFrame({\n ... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),\n ... 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n ... })\n >>> forecast, ax = f_759(df)\n >>> print(forecast)\n [106.99999813460752, 107.99999998338443, 108.99999547091295, 109.99999867405204, 110.99999292499156, 111.99999573455818, 112.9999903188028]\n \"\"\"", "canonical_solution": " # Creating the ARIMA model\n model = ARIMA(df['closing_price'], order=(5, 1, 0))\n model_fit = model.fit()\n \n # Forecasting the next 7 days\n forecast = model_fit.forecast(steps=7)\n # Plotting the forecast\n fig, ax = plt.subplots()\n ax.plot(df['date'], df['closing_price'], label='Historical Closing Prices')\n forecast_dates = pd.date_range(start=df['date'].iloc[-1] + pd.Timedelta(days=1), periods=7)\n ax.plot(forecast_dates, forecast, label='Forecasted Closing Prices')\n ax.legend()\n \n return forecast.tolist(), ax", "test": "# Importing required modules for testing\nimport unittest\nimport pandas as pd\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n # Creating a sample dataframe with closing prices for 7 days\n df1 = pd.DataFrame({\n 'date': pd.date_range(start='2022-01-01', end='2022-01-07', freq='D'),\n 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n })\n \n # Running the function\n forecast1, ax1 = f_759(df1)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast1, list)\n self.assertIsInstance(ax1, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast1, [106.99999813460752, 107.99999998338443, 
108.99999547091295, 109.99999867405204, 110.99999292499156, 111.99999573455818, 112.9999903188028]):\n self.assertAlmostEqual(a, b, places=3)\n \n # Checking if the plot contains data\n lines = ax1.get_lines()\n self.assertTrue(lines[0].get_ydata().tolist(), [100, 101, 102, 103, 104, 105, 106])\n def test_case_2(self):\n # Creating a sample dataframe with closing prices for 7 days\n df2 = pd.DataFrame({\n 'date': pd.date_range(start='2022-02-01', end='2022-02-07', freq='D'),\n 'closing_price': [200, 201, 202, 203, 204, 205, 206]\n })\n \n # Running the function\n forecast2, ax2 = f_759(df2)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast2, list)\n self.assertIsInstance(ax2, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast2, [206.9999997816766, 208.00000005262595, 208.99999941300158, 210.000000028273, 210.99999903094576, 211.99999982088116, 212.99999869216418]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax2.get_lines()\n self.assertAlmostEqual(lines[0].get_ydata().tolist(), [200, 201, 202, 203, 204, 205, 206])\n def test_case_3(self):\n # Creating a sample dataframe with closing prices for 7 days\n df3 = pd.DataFrame({\n 'date': pd.date_range(start='2022-03-01', end='2022-03-07', freq='D'),\n 'closing_price': [300, 301, 302, 303, 304, 305, 306]\n })\n \n # Running the function\n forecast3, ax3 = f_759(df3)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast3, list)\n self.assertIsInstance(ax3, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast3, [306.99999853839176, 308.00000003237324, 308.9999964108992, 309.9999991004857, 310.9999943724899, 311.9999968807911, 312.99999233933994]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax3.get_lines()\n # get data from the line\n self.assertAlmostEqual(lines[0].get_ydata().tolist(), [300, 
301, 302, 303, 304, 305, 306])\n def test_case_4(self):\n # Creating a sample dataframe with closing prices for 7 days\n df4 = pd.DataFrame({\n 'date': pd.date_range(start='2022-04-01', end='2022-04-07', freq='D'),\n 'closing_price': [400, 401, 402, 403, 404, 405, 406]\n })\n \n # Running the function\n forecast4, ax4 = f_759(df4)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast4, list)\n self.assertIsInstance(ax4, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast4, [406.99999936259456, 408.0000000781549, 408.99999837145054, 409.9999998156926, 410.9999973988557, 411.99999898892963, 412.9999964967954]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax4.get_lines()\n self.assertAlmostEqual(lines[0].get_ydata().tolist(), [400, 401, 402, 403, 404, 405, 406])\n def test_case_5(self):\n # Creating a sample dataframe with closing prices for 7 days\n df5 = pd.DataFrame({\n 'date': pd.date_range(start='2022-05-01', end='2022-05-07', freq='D'),\n 'closing_price': [500, 501, 502, 503, 504, 505, 506]\n })\n \n # Running the function\n forecast5, ax5 = f_759(df5)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast5, list)\n self.assertIsInstance(ax5, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast5, [506.99999853029163, 508.0000000310427, 508.99999639197796, 509.9999990913683, 510.9999943427388, 511.9999968573493, 512.9999922971087]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax5.get_lines()\n self.assertTrue(lines[0].get_ydata().tolist(), [500, 501, 502, 503, 504, 505, 506])", "apis": ["pandas.Timedelta", "pandas.date_range", "statsmodels.tsa.arima.model.ARIMA", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas", "statsmodels"], "doc": {"description": ["Forecasts the share closing prices for the next 7 days 
using the ARIMA model and plots the forecast."], "note": [], "params": ["df (pd.DataFrame): The input dataframe with columns 'date' and 'closing_price'.", "'date' should be of datetime dtype and 'closing_price' should be float."], "returns": ["Tuple[List[float], Axes]: A tuple containing:", "A list with forecasted prices for the next 7 days.", "A matplotlib Axes object containing the plot."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "statsmodels.tsa.arima.model.ARIMA"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),", "... 'closing_price': [100, 101, 102, 103, 104, 105, 106]", "... })", ">>> forecast, ax = f_759(df)", ">>> print(forecast)", "[106.99999813460752, 107.99999998338443, 108.99999547091295, 109.99999867405204, 110.99999292499156, 111.99999573455818, 112.9999903188028]"]}} -{"task_id": "f_333", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\n\n\ndef f_333(df, target_column, column_to_remove=\"c\", test_size=0.2):\n \"\"\"\n Split the data into train and test datasets after removing a specified column if it exists.\n\n Parameters:\n - df (dict): The input dataframe.\n - target_column (str): The name of the target column.\n - column_to_remove (str): The name of the column to remove. Defaults to 'c'.\n - test_size (float): The ratio of test data in split output. Defaults to .2.\n\n Returns:\n - X_train (pd.DataFrame): Split features for training.\n - X_test (pd.DataFrame): Split features for testing.\n - y_train (pd.Series): Split target values for training.\n - y_test (pd.Series): Split target values for testing.\n\n Requirements:\n - pandas\n - sklearn\n\n Examples:\n >>> data = {\n ... 'a': [1, 2, 3, 4],\n ... 'b': [5, 6, 7, 8],\n ... 'c': [9, 10, 11, 12],\n ... 'target': [0, 1, 0, 1]\n ... }\n >>> X_train, _, _, _ = f_333(data, 'target')\n >>> type(X_train), X_train.shape\n (, (3, 2))\n >>> data = {\n ... 'x1': [10, 20, 30, 40],\n ... 
'x2': [50, 60, 70, 80],\n ... 'x3': [90, 100, 110, 120],\n ... 'outcome': [1, 2, 3, 4]\n ... }\n >>> df2 = pd.DataFrame(data)\n >>> _, _, _, y_test = f_333(df2, 'outcome', 'x3', .25)\n >>> type(y_test), y_test.shape\n (, (1,))\n \"\"\"", "canonical_solution": " df = pd.DataFrame(df)\n # Drop the specified column if it exists in the dataframe\n if column_to_remove in df.columns:\n df = df.drop(columns=column_to_remove)\n\n # Split the dataframe into training and test datasets\n X_train, X_test, y_train, y_test = train_test_split(\n df.drop(columns=target_column), df[target_column], test_size=test_size\n )\n\n return X_train, X_test, y_train, y_test", "test": "import unittest\nimport pandas as pd\nfrom sklearn.utils._param_validation import InvalidParameterError\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # basic test dataframe\n self.df = {\"a\": [1, 2, 3, 4, 5], \"b\": [4, 5, 6, 7, 8], \"c\": [7, 8, 9, 10, 11]}\n def shape_testing_helper(self, expected_train_len, expected_test_len, split_data):\n X_train, X_test, y_train, y_test = split_data\n self.assertTrue(len(X_train) == expected_train_len)\n self.assertTrue(len(y_train) == expected_train_len)\n self.assertTrue(len(X_test) == expected_test_len)\n self.assertTrue(len(y_test) == expected_test_len)\n def test_case_1(self):\n # Dataframe with a 'c' column to be removed\n X_train, X_test, y_train, y_test = f_333(self.df, \"b\")\n self.assertEqual(\"a\", X_train.columns[0])\n self.assertEqual(\"b\", y_train.name)\n self.assertNotIn(\"c\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))\n def test_case_2(self):\n # Specify removal of separate column\n X_train, X_test, y_train, y_test = f_333(self.df, \"a\", column_to_remove=\"b\")\n self.assertEqual(\"c\", X_train.columns[0])\n self.assertEqual(\"a\", y_train.name)\n self.assertNotIn(\"b\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))\n def test_case_3(self):\n # Dataframe 
doesn't have column to be removed\n X_train, X_test, y_train, y_test = f_333(self.df, \"a\", column_to_remove=\"FOO\")\n self.assertEqual(\"a\", y_train.name)\n self.assertIn(\"b\", X_train.columns)\n self.assertIn(\"c\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))\n def test_case_4(self):\n # Change testing ratio\n X_train, X_test, y_train, y_test = f_333(self.df, \"a\", test_size=0.8)\n self.shape_testing_helper(1, 4, (X_train, X_test, y_train, y_test))\n def test_case_5(self):\n # Should fail if specify invalid ratio\n with self.assertRaises(InvalidParameterError):\n f_333(self.df, \"a\", test_size=-999)\n with self.assertRaises(InvalidParameterError):\n f_333(self.df, \"a\", test_size=\"foo\")\n def test_case_6(self):\n # Testing with a dataframe having mixed data types\n df = {\n \"a\": [pd.NA, 2.3, 3.4, 4.5, 5.5],\n \"b\": [\"one\", \"two\", pd.NA, \"four\", \"five\"],\n \"c\": [True, False, True, False, pd.NA],\n }\n X_train, X_test, y_train, y_test = f_333(df, \"b\")\n self.assertNotIn(\"c\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))", "apis": ["sklearn.model_selection.train_test_split", "pandas.DataFrame"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Split the data into train and test datasets after removing a specified column if it exists."], "note": [], "params": ["df (dict): The input dataframe.", "target_column (str): The name of the target column.", "column_to_remove (str): The name of the column to remove. Defaults to 'c'.", "test_size (float): The ratio of test data in split output. Defaults to .2."], "returns": ["X_train (pd.DataFrame): Split features for training.", "X_test (pd.DataFrame): Split features for testing.", "y_train (pd.Series): Split target values for training.", "y_test (pd.Series): Split target values for testing."], "reqs": ["pandas", "sklearn"], "raises": [], "example": ["Examples:", ">>> data = {", "... 'a': [1, 2, 3, 4],", "... 
'b': [5, 6, 7, 8],", "... 'c': [9, 10, 11, 12],", "... 'target': [0, 1, 0, 1]", "... }", ">>> X_train, _, _, _ = f_333(data, 'target')", ">>> type(X_train), X_train.shape", "(, (3, 2))", ">>> data = {", "... 'x1': [10, 20, 30, 40],", "... 'x2': [50, 60, 70, 80],", "... 'x3': [90, 100, 110, 120],", "... 'outcome': [1, 2, 3, 4]", "... }", ">>> df2 = pd.DataFrame(data)", ">>> _, _, _, y_test = f_333(df2, 'outcome', 'x3', .25)", ">>> type(y_test), y_test.shape", "(, (1,))"]}} -{"task_id": "f_756", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\n\ndef f_756(df):\n \"\"\"\n Predicts the stock closing prices for the next 7 days using simple linear regression and plots the data.\n\n Parameters:\n df (DataFrame): The input dataframe with columns 'date' and 'closing_price'. 'date' should be in datetime format.\n\n Returns:\n tuple: A tuple containing:\n - list: A list with predicted prices for the next 7 days.\n - Axes: The matplotlib Axes object containing the plot.\n \n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - sklearn.linear_model.LinearRegression\n\n Constants:\n - The function uses a constant time step of 24*60*60 seconds to generate future timestamps.\n\n Example:\n >>> df = pd.DataFrame({\n ... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),\n ... 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n ... 
})\n >>> pred_prices, plot = f_756(df)\n >>> print(pred_prices)\n [107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0]\n \"\"\"", "canonical_solution": " # Convert date to timestamp\n df['date'] = pd.to_datetime(df['date'])\n df['date'] = df['date'].map(pd.Timestamp.timestamp)\n \n # Prepare data\n X = df['date'].values.reshape(-1, 1)\n y = df['closing_price'].values\n \n # Fit model\n model = LinearRegression()\n model.fit(X, y)\n \n # Predict future prices\n future_dates = np.array([df['date'].max() + i*24*60*60 for i in range(1, 8)]).reshape(-1, 1)\n pred_prices = model.predict(future_dates)\n \n # Plot\n fig, ax = plt.subplots()\n ax.scatter(df['date'], df['closing_price'], color='black')\n ax.plot(future_dates, pred_prices, color='blue', linewidth=3)\n \n return pred_prices.tolist(), ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),\n 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_2(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='2/1/2021', end='2/7/2021'),\n 'closing_price': [200, 201, 202, 203, 204, 205, 206]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [207.0, 208.0, 209.0, 210.0, 211.0, 212.0, 213.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_3(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='3/1/2021', end='3/7/2021'),\n 'closing_price': [300, 301, 302, 303, 304, 305, 306]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [307.0, 308.0, 309.0, 310.0, 311.0, 312.0, 313.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_4(self):\n df = 
pd.DataFrame({\n 'date': pd.date_range(start='4/1/2021', end='4/7/2021'),\n 'closing_price': [400, 401, 402, 403, 404, 405, 406]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [407.0, 408.0, 409.0, 410.0, 411.0, 412.0, 413.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_5(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='5/1/2021', end='5/7/2021'),\n 'closing_price': [500, 501, 502, 503, 504, 505, 506]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [507.0, 508.0, 509.0, 510.0, 511.0, 512.0, 513.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')", "apis": ["pandas.to_datetime", "sklearn.linear_model.LinearRegression", "pandas.Timestamp", "numpy.array", "matplotlib.pyplot.subplots"], "libs": ["numpy", "pandas", "sklearn", "matplotlib"], "doc": {"description": ["Predicts the stock closing prices for the next 7 days using simple linear regression and plots the data.", "Constants:", "- The function uses a constant time step of 24*60*60 seconds to generate future timestamps."], "note": [], "params": ["df (DataFrame): The input dataframe with columns 'date' and 'closing_price'. 'date' should be in datetime format."], "returns": ["tuple: A tuple containing:", "list: A list with predicted prices for the next 7 days.", "Axes: The matplotlib Axes object containing the plot."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "sklearn.linear_model.LinearRegression"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),", "... 'closing_price': [100, 101, 102, 103, 104, 105, 106]", "... 
})", ">>> pred_prices, plot = f_756(df)", ">>> print(pred_prices)", "[107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0]"]}} -{"task_id": "f_860", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report\n\n\ndef f_860(csv_file_path, target_column=\"target\", test_size=0.2, n_estimators=100):\n \"\"\"\n Processes a CSV file to train a Random Forest classifier and generates a formatted classification report.\n\n Parameters:\n csv_file_path (str): The path to the CSV file containing the data.\n target_column (str, optional): The name of the target variable column. Defaults to 'target'.\n test_size (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.2.\n n_estimators (int, optional): The number of trees in the RandomForestClassifier. Defaults to 100.\n\n Returns:\n str: A formatted classification report. The report includes metrics such as precision, recall,\n f1-score for each class, as well as overall accuracy, macro average, and weighted average.\n\n Raises:\n ValueError: If the specified target_column is not found in the CSV file.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> report = f_860('/path/to/data.csv')\n >>> print(report)\n class 0 0.88 0.90 0.89 50\n class 1 0.89 0.87 0.88 48\n ...\n accuracy 0.89 100\n macro avg 0.88 0.89 0.88 100\n weighted avg 0.89 0.89 0.89 100\n\n Note:\n The CSV file must have a column with the name specified by 'target_column', and it should be in a\n format readable by pandas.read_csv().\n \"\"\"", "canonical_solution": " df = pd.read_csv(csv_file_path)\n if target_column not in df.columns:\n raise ValueError(f\"'{target_column}' column not found in the CSV file.\")\n\n X = df.drop(target_column, axis=1)\n y = df[target_column]\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=test_size, random_state=42\n )\n clf = 
RandomForestClassifier(n_estimators=n_estimators, random_state=42)\n clf.fit(X_train, y_train)\n y_pred = clf.predict(X_test)\n report = classification_report(y_test, y_pred)\n\n # New formatting approach\n lines = report.split(\"\\n\")\n formatted_lines = []\n for line in lines:\n # Split the line into words and rejoin with specific spacing\n parts = line.split()\n if len(parts) == 5: # Class-specific metrics\n formatted_line = f\"{parts[0]:<15}{parts[1]:>10}{parts[2]:>10}{parts[3]:>10}{parts[4]:>10}\"\n elif len(parts) == 4: # Overall metrics\n formatted_line = f\"{parts[0]:<15}{parts[1]:>10}{parts[2]:>10}{parts[3]:>10}\"\n else:\n formatted_line = line # Header or empty lines\n formatted_lines.append(formatted_line)\n\n formatted_report = \"\\n\".join(formatted_lines)\n return formatted_report", "test": "import unittest\nfrom unittest.mock import patch\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_860.\"\"\"\n @patch(\"pandas.read_csv\")\n def test_default_parameters(self, mock_read_csv):\n \"\"\"\n Test f_860 with default parameters using an adequately sized mock dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"target\": [0, 1] * 50, # Alternating 0s and 1s\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\")\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_non_default_target_column(self, mock_read_csv):\n \"\"\"\n Test f_860 with a non-default target column using a larger mock dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"label\": [1, 0] * 50, # Alternating 1s and 0s\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\", target_column=\"label\")\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_different_test_size(self, mock_read_csv):\n \"\"\"\n Test f_860 with a different test 
size and a larger dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"target\": [0, 1, 1, 0] * 25, # Repeated pattern\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\", test_size=0.5)\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_different_n_estimators(self, mock_read_csv):\n \"\"\"\n Test f_860 with a different number of estimators and an expanded dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"target\": [1, 0] * 50, # Alternating 1s and 0s\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\", n_estimators=50)\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_missing_target_column(self, mock_read_csv):\n \"\"\"\n Test f_860 with a missing target column.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame(\n {\"feature1\": [1, 2], \"feature2\": [3, 4]}\n )\n with self.assertRaises(ValueError):\n f_860(\"dummy_path.csv\", target_column=\"not_exist\")", "apis": ["sklearn.model_selection.train_test_split", "sklearn.metrics.classification_report", "pandas.read_csv", "sklearn.ensemble.RandomForestClassifier"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Processes a CSV file to train a Random Forest classifier and generates a formatted classification report."], "note": ["The CSV file must have a column with the name specified by 'target_column', and it should be in a", "format readable by pandas.read_csv()."], "params": ["csv_file_path (str): The path to the CSV file containing the data.", "target_column (str, optional): The name of the target variable column. Defaults to 'target'.", "test_size (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.2.", "n_estimators (int, optional): The number of trees in the RandomForestClassifier. 
Defaults to 100."], "returns": ["str: A formatted classification report. The report includes metrics such as precision, recall,", "f1-score for each class, as well as overall accuracy, macro average, and weighted average."], "reqs": ["pandas", "sklearn"], "raises": ["ValueError: If the specified target_column is not found in the CSV file."], "example": [">>> report = f_860('/path/to/data.csv')", ">>> print(report)", "class 0 0.88 0.90 0.89 50", "class 1 0.89 0.87 0.88 48", "...", "accuracy 0.89 100", "macro avg 0.88 0.89 0.88 100", "weighted avg 0.89 0.89 0.89 100"]}} -{"task_id": "f_832", "prompt": "import random\nimport string\n\n\ndef f_832(length: int, predicates: list, seed: int = None):\n \"\"\"\n Generates a random string of specified length and evaluates it for specific characteristics.\n\n Parameters:\n - length (int): Desired length of the generated string.\n - predicates (list of strings): Conditions to evaluate the string.\n Must contain options from 'has_uppercase', 'has_lowercase', 'has_special_chars', 'has_numbers'.\n - seed (int, optional): Seed for the random number generator for reproducibility.\n\n Returns:\n - tuple:\n - string: the generated random text\n - dict: the text's characteristics\n\n Raises:\n - ValueError: If the specified length is negative.\n - KeyError: If any predicate is not recognized.\n\n Notes:\n - Predicates are deduplicated.\n - Characters are randomly sampled from string ascii_letters, digits, and punctuation with replacement.\n - Any invalid predicates provided will result in a KeyError.\n - If no predicates are provided, the result dictionary will be empty.\n\n Requirements:\n - string\n - random\n\n Example:\n >>> f_832(10, ['has_uppercase', 'has_numbers'], seed=42)\n ('8czu(\"@iNc', {'has_uppercase': True, 'has_numbers': True})\n >>> f_832(5, ['has_lowercase'], seed=123)\n ('eiMk[', {'has_lowercase': True})\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n if length < 0:\n raise 
ValueError(\"Length must be non-negative.\")\n\n predicate_functions = {\n \"has_uppercase\": lambda x: any(c.isupper() for c in x),\n \"has_lowercase\": lambda x: any(c.islower() for c in x),\n \"has_special_chars\": lambda x: any(c in string.punctuation for c in x),\n \"has_numbers\": lambda x: any(c.isdigit() for c in x),\n }\n\n predicates = list(set(predicates))\n if any(p not in predicate_functions for p in predicates):\n raise KeyError(f\"Invalid predicate provided.\")\n\n characters = string.ascii_letters + string.digits + string.punctuation\n generated_string = \"\".join(random.choices(characters, k=length))\n\n results = {\n predicate: predicate_functions[predicate](generated_string)\n for predicate in predicates\n }\n\n return generated_string, results", "test": "import unittest\nimport string\nclass TestCases(unittest.TestCase):\n def test_valid_length_and_predicates(self):\n result_str, result_dict = f_832(\n 10,\n [\"has_uppercase\", \"has_lowercase\", \"has_numbers\", \"has_special_chars\"],\n seed=1,\n )\n self.assertEqual(len(result_str), 10)\n self.assertTrue(result_dict[\"has_uppercase\"])\n self.assertTrue(result_dict[\"has_lowercase\"])\n self.assertTrue(result_dict[\"has_numbers\"])\n self.assertTrue(result_dict[\"has_special_chars\"])\n def test_result_correctness(self):\n n_repetitions = 1000\n for _ in range(n_repetitions):\n result_str, result_dict = f_832(\n 10,\n [\"has_uppercase\", \"has_lowercase\", \"has_numbers\", \"has_special_chars\"],\n seed=1,\n )\n if any(c.isupper() for c in result_str):\n self.assertTrue(result_dict[\"has_uppercase\"])\n if any(c.islower() for c in result_str):\n self.assertTrue(result_dict[\"has_lowercase\"])\n if any(c in string.punctuation for c in result_str):\n self.assertTrue(result_dict[\"has_special_chars\"])\n if any(c.isdigit() for c in result_str):\n self.assertTrue(result_dict[\"has_numbers\"])\n def test_empty_string(self):\n result_str, result_dict = f_832(0, [\"has_uppercase\", \"has_numbers\"], 
seed=3)\n self.assertEqual(result_str, \"\")\n self.assertFalse(result_dict[\"has_uppercase\"])\n self.assertFalse(result_dict[\"has_numbers\"])\n def test_negative_length(self):\n with self.assertRaises(ValueError):\n f_832(-1, [\"has_uppercase\"])\n def test_no_predicates(self):\n result_str, result_dict = f_832(10, [], seed=5)\n self.assertEqual(len(result_str), 10)\n self.assertEqual(result_dict, {})\n def test_key_error(self):\n with self.assertRaises(KeyError):\n f_832(10, [\"has_uppercase\", \"invalid\"])\n def test_deduplicate_predicates(self):\n _, result_dict = f_832(15, [\"has_uppercase\", \"has_uppercase\"], seed=7)\n self.assertEqual(len(result_dict), 1)\n def test_random_seed_reproducibility(self):\n result_str1, result_dict1 = f_832(10, [\"has_uppercase\", \"has_numbers\"], seed=8)\n result_str2, result_dict2 = f_832(10, [\"has_uppercase\", \"has_numbers\"], seed=8)\n self.assertEqual(result_str1, result_str2)\n self.assertEqual(result_dict1, result_dict2)", "apis": ["string.punctuation", "random.seed", "string.digits", "string.ascii_letters", "random.choices"], "libs": ["string", "random"], "doc": {"description": ["Generates a random string of specified length and evaluates it for specific characteristics.", "Notes:", "- Predicates are deduplicated.", "- Characters are randomly sampled from string ascii_letters, digits, and punctuation with replacement.", "- Any invalid predicates provided will result in a KeyError.", "- If no predicates are provided, the result dictionary will be empty."], "note": [], "params": ["length (int): Desired length of the generated string.", "predicates (list of strings): Conditions to evaluate the string.", "Must contain options from 'has_uppercase', 'has_lowercase', 'has_special_chars', 'has_numbers'.", "seed (int, optional): Seed for the random number generator for reproducibility."], "returns": ["tuple:", "string: the generated random text", "dict: the text's characteristics"], "reqs": ["string", "random"], "raises": 
["ValueError: If the specified length is negative.", "KeyError: If any predicate is not recognized."], "example": [">>> f_832(10, ['has_uppercase', 'has_numbers'], seed=42)", "('8czu(\"@iNc', {'has_uppercase': True, 'has_numbers': True})", ">>> f_832(5, ['has_lowercase'], seed=123)", "('eiMk[', {'has_lowercase': True})"]}} -{"task_id": "f_395", "prompt": "from datetime import datetime, timedelta\nimport pandas as pd\nimport random\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_395(days_in_past=7, random_seed=0):\n \"\"\"\n Generates a graph of daily activity durations for a specified number of days in the past\n using randomly generated data for activities.\n\n This function randomly generates acitivity durations from 0 to 120 for each activity\n from [\"Running\", \"Swimming\", \"Cycling\", \"Yoga\", \"Weight Training\"].\n\n Parameters:\n days_in_past (int, optional): The number of days in the past for which to generate the graph.\n Defaults to 7 days. Must be in the past.\n random_seed (int, optional): Seed for random number generation to ensure reproducibility.\n Defaults to 0.\n\n Returns:\n Tuple containing\n - ax (plt.Axes): DataFrame used for plotting.\n - df (pd.DataFrame): Seaborn lineplot with date on the x-axis, duration on the y-axis, and activity as hue.\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - pandas\n - random\n - seaborn\n\n Example:\n >>> ax, df = f_395(7, random_seed=42)\n >>> type(ax)\n \n\n A sample row from the returned DataFrame might look like:\n Date Activity Duration\n YYYY-MM-DD Running 45\n \"\"\"", "canonical_solution": "\n random.seed(random_seed)\n\n if days_in_past < 1:\n raise ValueError(\"days_in_past must be in the past\")\n\n ACTIVITIES = [\"Running\", \"Swimming\", \"Cycling\", \"Yoga\", \"Weight Training\"]\n\n data = []\n for i in range(days_in_past):\n date = datetime.now().date() - timedelta(days=i)\n for activity in ACTIVITIES:\n duration = random.randint(0, 120)\n 
data.append([date, activity, duration])\n\n df = pd.DataFrame(data, columns=[\"Date\", \"Activity\", \"Duration\"])\n ax = sns.lineplot(data=df, x=\"Date\", y=\"Duration\", hue=\"Activity\")\n return ax, df", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.default_days_in_past = 7\n self.default_activities = [\n \"Running\",\n \"Swimming\",\n \"Cycling\",\n \"Yoga\",\n \"Weight Training\",\n ]\n def _check_df(self, df, days_in_past):\n self.assertEqual(set(df.columns), {\"Duration\", \"Activity\", \"Date\"})\n self.assertTrue((df[\"Duration\"] >= 0).all() and (df[\"Duration\"] <= 120).all())\n self.assertEqual(len(df[\"Date\"].unique()), days_in_past)\n def _check_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n legend_labels = [t.get_text() for t in ax.get_legend().get_texts()]\n for activity in self.default_activities:\n self.assertIn(activity, legend_labels)\n def test_case_1(self):\n # Test using default parameters\n ax, df = f_395()\n self._check_df(df, self.default_days_in_past)\n self._check_plot(ax)\n def test_case_2(self):\n # Test using custom parameters\n ax, df = f_395(10, random_seed=2)\n self._check_df(df, 10)\n self._check_plot(ax)\n def test_case_3(self):\n # Test days_in_past\n for ndays in [1, 5, 10, 100, 500]:\n _, df = f_395(ndays)\n self.assertEqual(len(df[\"Date\"].unique()), ndays)\n def test_case_4(self):\n # Test random seed\n _, df1 = f_395(10, random_seed=4)\n _, df2 = f_395(10, random_seed=4)\n _, df3 = f_395(10, random_seed=0)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertFalse(df2.equals(df3))\n def test_case_5(self):\n # Test handling invalid days in past\n with self.assertRaises(ValueError):\n f_395(0, random_seed=5)\n with self.assertRaises(ValueError):\n f_395(-1, random_seed=5)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["random.randint", "seaborn.lineplot", "random.seed", "datetime.datetime.now", "datetime.timedelta", 
"pandas.DataFrame"], "libs": ["pandas", "random", "datetime", "seaborn"], "doc": {"description": ["Generates a graph of daily activity durations for a specified number of days in the past", "using randomly generated data for activities.", "This function randomly generates acitivity durations from 0 to 120 for each activity", "from [\"Running\", \"Swimming\", \"Cycling\", \"Yoga\", \"Weight Training\"].", "A sample row from the returned DataFrame might look like:", "Date Activity Duration", "YYYY-MM-DD Running 45"], "note": [], "params": ["days_in_past (int, optional): The number of days in the past for which to generate the graph.", "Defaults to 7 days. Must be in the past.", "random_seed (int, optional): Seed for random number generation to ensure reproducibility.", "Defaults to 0."], "returns": ["Tuple containing", "ax (plt.Axes): DataFrame used for plotting.", "df (pd.DataFrame): Seaborn lineplot with date on the x-axis, duration on the y-axis, and activity as hue."], "reqs": ["datetime.datetime", "datetime.timedelta", "pandas", "random", "seaborn"], "raises": [], "example": [">>> ax, df = f_395(7, random_seed=42)", ">>> type(ax)", ""]}} -{"task_id": "f_831", "prompt": "import os\nimport re\nfrom pathlib import Path\n\n\ndef f_831(dir_path: str, predicates: list) -> dict:\n \"\"\"\n Evaluates each item (files and directories) in a given directory against specified conditions.\n\n Parameters:\n - dir_path (str): The path to the directory to be evaluated. Must exist.\n - predicates (list of strings): Names of conditions to check for.\n Must contain valid conditions. Invalid conditions are ignored.\n Supported conditions:\n 1. 'is_file': whether the item is a file\n 2. 'is_dir': whether the item is a directory\n 3. 'has_special_chars': whether the item name contains a character that\n is not a letter, digit, or underscore, ignoring file extensions\n 4. 
'has_numbers': whether the item name contains a number\n\n Returns:\n - dict: A dictionary with directory items as keys and the results of condition checks as values.\n\n Raises:\n - ValueError: If no valid predicates are provided.\n - FileNotFoundError: If the specified directory does not exist or is not a directory.\n\n Note:\n - This function evaluates file/directory names, rather than their full path.\n - Predicates are deduplicated.\n\n Requirements:\n - os\n - re\n - pathlib\n\n Examples:\n >>> f_831('/path/to/dir', ['is_file', 'has_numbers'])\n {'file.txt': {'is_file': True, 'has_numbers': False}, 'file2.txt': {'is_file': True, 'has_numbers': True}}\n >>> f_831('/path/to/dir', ['is_dir', 'has_special_chars'])\n {'my_folder': {'is_dir': True, 'has_special_chars': False}, 'a_@Folder': {'is_dir': True, 'has_special_chars': True}}\n \"\"\"", "canonical_solution": " predicate_functions = {\n \"is_file\": lambda x: x.is_file(),\n \"is_dir\": lambda x: x.is_dir(),\n \"has_special_chars\": lambda x: bool(re.search(r\"\\W\", x.stem)),\n \"has_numbers\": lambda x: bool(re.search(r\"\\d\", x.name)),\n }\n predicates = [p for p in set(predicates) if p in predicate_functions]\n if not predicates:\n raise ValueError(\"No valid predicates provided.\")\n\n if not os.path.exists(dir_path) or not os.path.isdir(dir_path):\n raise FileNotFoundError(\n f\"The directory {dir_path} does not exist or is not a directory.\"\n )\n\n results = {}\n for item in os.listdir(dir_path):\n full_path = Path(os.path.join(dir_path, item))\n results[item] = {\n predicate_name: predicate_fn(full_path)\n for predicate_name, predicate_fn in predicate_functions.items()\n if predicate_name in predicates\n }\n return results", "test": "import unittest\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = TemporaryDirectory()\n self.test_dir = self.temp_dir.name\n self.fields = [\n \"is_file\",\n 
\"is_dir\",\n \"has_special_chars\",\n \"has_numbers\",\n ]\n self.is_file_fns = [\n \"file\",\n \"file.txt\",\n \"file1.txt\",\n \"somefile\",\n ]\n self.is_dir_fns = [\"somedir\", \"aDirectory123\"]\n def tearDown(self):\n self.temp_dir.cleanup()\n def helper_make_data(self, name, is_dir=False):\n # Helper function to make test files\n if is_dir:\n Path(os.path.join(self.test_dir, name)).mkdir()\n else:\n Path(os.path.join(self.test_dir, name)).touch()\n def helper_assert_predicate(self, results, predicates):\n # Helper to check only specified predicates are returned\n num_predicates = len(predicates)\n self.assertTrue(all(len(r) == num_predicates for r in results.values()))\n self.assertTrue(\n all(predicate in r for r in results.values() for predicate in predicates)\n )\n def test_file_is_file(self):\n field = \"is_file\"\n for fn in self.is_file_fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_file_fns:\n self.assertTrue(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_file_is_not_dir(self):\n field = \"is_dir\"\n for fn in self.is_file_fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_file_fns:\n self.assertFalse(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_dir_is_dir(self):\n field = \"is_dir\"\n for fn in self.is_dir_fns:\n self.helper_make_data(fn, is_dir=True)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_dir_fns:\n self.assertTrue(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_dir_is_not_file(self):\n field = \"is_file\"\n for fn in self.is_dir_fns:\n self.helper_make_data(fn, is_dir=True)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_dir_fns:\n self.assertFalse(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_has_special_char(self):\n field = \"has_special_chars\"\n fns 
= [\"fi!e\", \"fi@\", \"f.ile.txt\"]\n for fn in fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in fns:\n self.assertTrue(result[fn][field], result)\n self.helper_assert_predicate(result, [field])\n def test_has_no_special_char(self):\n field = \"has_special_chars\"\n fns = [\"file_\", \"_file\", \"file.txt\", \"some_file.txt\"]\n for fn in fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in fns:\n self.assertFalse(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_has_numbers(self):\n field = \"has_numbers\"\n fns = [\"123\", \"123.txt\", \"text123\", \"t1e2x3t4\"]\n for fn in fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in fns:\n self.assertTrue(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_multiple_predicates(self):\n fn = \"test1!.txt\"\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), self.fields)\n self.helper_assert_predicate(result, self.fields)\n self.assertTrue(result[fn][\"is_file\"])\n self.assertFalse(result[fn][\"is_dir\"])\n self.assertTrue(result[fn][\"has_special_chars\"])\n self.assertTrue(result[fn][\"has_numbers\"])\n def test_deduplicate_predicates(self):\n fn = \"test_file\"\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [\"is_file\", \"is_file\"])\n self.assertTrue(len(result) == 1)\n self.helper_assert_predicate(result, [\"is_file\"])\n def test_empty_predicates(self):\n with self.assertRaises(ValueError):\n f_831(str(self.test_dir), [])\n def test_invalid_predicates(self):\n with self.assertRaises(ValueError):\n f_831(str(self.test_dir), [\"foo\", \"bar\"])\n def test_nonexistent_directory_error(self):\n with self.assertRaises(FileNotFoundError):\n f_831(\"nonexistent_dir\", [\"is_file\"])", "apis": ["re.search", "os.path.exists", "os.listdir", "os.path.isdir", 
"os.path", "os.path.join", "pathlib.Path"], "libs": ["re", "os", "pathlib"], "doc": {"description": ["Evaluates each item (files and directories) in a given directory against specified conditions."], "note": ["This function evaluates file/directory names, rather than their full path.", "Predicates are deduplicated."], "params": ["dir_path (str): The path to the directory to be evaluated. Must exist.", "predicates (list of strings): Names of conditions to check for.", "Must contain valid conditions. Invalid conditions are ignored.", "Supported conditions:", "1. 'is_file': whether the item is a file", "2. 'is_dir': whether the item is a directory", "3. 'has_special_chars': whether the item name contains a character that", "is not a letter, digit, or underscore, ignoring file extensions", "4. 'has_numbers': whether the item name contains a number"], "returns": ["dict: A dictionary with directory items as keys and the results of condition checks as values."], "reqs": ["os", "re", "pathlib"], "raises": ["ValueError: If no valid predicates are provided.", "FileNotFoundError: If the specified directory does not exist or is not a directory."], "example": ["Examples:", ">>> f_831('/path/to/dir', ['is_file', 'has_numbers'])", "{'file.txt': {'is_file': True, 'has_numbers': False}, 'file2.txt': {'is_file': True, 'has_numbers': True}}", ">>> f_831('/path/to/dir', ['is_dir', 'has_special_chars'])", "{'my_folder': {'is_dir': True, 'has_special_chars': False}, 'a_@Folder': {'is_dir': True, 'has_special_chars': True}}"]}} -{"task_id": "f_921", "prompt": "from datetime import datetime\nimport pytz\nimport numpy as np\n\n\ndef f_921(time_strings, timezone):\n \"\"\"\n Calculates the average time difference in seconds between each consecutive pair of timestamps\n in a given list, after converting them to a specified timezone.\n\n Parameters:\n - time_strings (list of str): A list of timestamp strings in the format 'dd/mm/yy HH:MM:SS.fff'.\n - timezone (str): The timezone to which the 
timestamp strings should be converted.\n This should be a valid timezone string, e.g., 'America/New_York'.\n\n Returns:\n - float: The mean (average) time difference in seconds between each consecutive pair of timestamps.\n If there are less than two timestamps in the list, the function returns 0.0.\n\n Requirements:\n - datetime\n - pytz\n - numpy\n\n Notes:\n - The function first converts each timestamp in the list to the specified timezone.\n - It then calculates the absolute time difference in seconds between each consecutive pair of timestamps.\n - If the list contains less than two timestamps, the function returns 0.0, as there are no pairs to compare.\n - If there are no time differences (e.g., in case of a single timestamp after timezone conversion), it also returns 0.0.\n - The function uses numpy's mean function to calculate the average time difference.\n\n Example:\n >>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']\n >>> mean_diff = f_921(time_strings, 'America/New_York')\n >>> print(mean_diff)\n 61.0\n \"\"\"", "canonical_solution": " if len(time_strings) < 2:\n return 0.0\n\n time_zone = pytz.timezone(timezone)\n parsed_times = [\n datetime.strptime(ts, \"%d/%m/%y %H:%M:%S.%f\")\n .replace(tzinfo=pytz.UTC)\n .astimezone(time_zone)\n for ts in time_strings\n ]\n\n differences = [\n abs((t2 - t1).total_seconds()) for t1, t2 in zip(parsed_times, parsed_times[1:])\n ]\n\n return np.mean(differences) if differences else 0.0", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_921\"\"\"\n def test_example_case(self):\n \"\"\"Test the example case.\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:32:33.123\",\n \"30/03/09 16:33:34.123\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"America/New_York\"), 61.0)\n def test_different_timezones(self):\n \"\"\"Test different timezones.\"\"\"\n time_strings = [\n \"01/04/21 12:00:00.000\",\n \"01/04/21 
12:01:01.000\",\n \"01/04/21 12:02:02.000\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 61.0)\n self.assertAlmostEqual(f_921(time_strings, \"Europe/London\"), 61.0)\n def test_varying_differences(self):\n \"\"\"Test varying differences.\"\"\"\n time_strings = [\n \"01/04/21 12:00:00.000\",\n \"01/04/21 12:01:01.000\",\n \"01/04/21 12:03:03.000\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 91.5)\n def test_single_time_string(self):\n \"\"\"Test single time string.\"\"\"\n time_strings = [\"01/04/21 12:00:00.000\"]\n self.assertEqual(f_921(time_strings, \"Asia/Tokyo\"), 0.0)\n def test_span_across_days(self):\n \"\"\"Test span across days.\"\"\"\n time_strings = [\"31/03/21 23:59:00.000\", \"01/04/21 00:01:00.000\"]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 120.0)\n def test_out_of_order_strings(self):\n \"\"\"Test out of order strings.\"\"\"\n time_strings = [\n \"01/04/21 12:02:02.000\",\n \"01/04/21 12:00:00.000\",\n \"01/04/21 12:01:01.000\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 91.5)", "apis": ["numpy.mean", "pytz.UTC", "datetime.datetime.strptime", "pytz.timezone"], "libs": ["numpy", "datetime", "pytz"], "doc": {"description": ["Calculates the average time difference in seconds between each consecutive pair of timestamps", "in a given list, after converting them to a specified timezone.", "Notes:", "- The function first converts each timestamp in the list to the specified timezone.", "- It then calculates the absolute time difference in seconds between each consecutive pair of timestamps.", "- If the list contains less than two timestamps, the function returns 0.0, as there are no pairs to compare.", "- If there are no time differences (e.g., in case of a single timestamp after timezone conversion), it also returns 0.0.", "- The function uses numpy's mean function to calculate the average time difference."], "note": [], "params": ["time_strings (list of str): A list 
of timestamp strings in the format 'dd/mm/yy HH:MM:SS.fff'.", "timezone (str): The timezone to which the timestamp strings should be converted.", "This should be a valid timezone string, e.g., 'America/New_York'."], "returns": ["float: The mean (average) time difference in seconds between each consecutive pair of timestamps.", "If there are less than two timestamps in the list, the function returns 0.0."], "reqs": ["datetime", "pytz", "numpy"], "raises": [], "example": [">>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']", ">>> mean_diff = f_921(time_strings, 'America/New_York')", ">>> print(mean_diff)", "61.0"]}} -{"task_id": "f_913", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_913(data_dict):\n \"\"\"\n Generates histograms for each column in the given DataFrame and checks if the value distributions\n are uniform. It prints a message for each non-uniform distribution.\n\n Parameters:\n df (pd.DataFrame): The DataFrame to be analyzed.\n\n Returns:\n List[plt.Axes]: A list of matplotlib Axes objects, each representing the histogram for a column.\n \n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> data = {'Category1': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E'],\n ... 
'Category2': ['X', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'W', 'W', 'W', 'W', 'W']}\n >>> axes = f_913(data)\n The distribution of values in column 'Category1' is not uniform.\n The distribution of values in column 'Category2' is not uniform.\n >>> [ax.get_title() for ax in axes]\n ['Category1', 'Category2']\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_dict)\n axes_list = []\n for column in df.columns:\n counts = df[column].value_counts()\n uniform = (\n len(set(counts)) == 1\n ) # Check if all counts are the same (uniform distribution)\n\n if not uniform:\n print(f\"The distribution of values in column '{column}' is not uniform.\")\n\n ax = counts.plot(kind=\"bar\")\n ax.set_title(column)\n axes_list.append(ax)\n plt.close()\n\n return axes_list", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_913 function.\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test for uniform distribution.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n \"Category2\": [\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\", \"Category2\"])\n def test_non_uniform_distribution(self):\n \"\"\"Test for non-uniform distribution.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\", \"C\"],\n \"Category2\": [\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\", \"Z\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\", \"Category2\"])\n def test_single_column(self):\n \"\"\"Test for single column.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\"])\n def test_multiple_categories(self):\n \"\"\"Test for multiple categories.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\", \"D\", \"D\", \"E\", \"E\"],\n \"Category2\": 
[\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\", \"W\", \"W\", \"V\", \"V\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\", \"Category2\"])\n def test_empty_dataframe(self):\n \"\"\"Test for empty dataframe.\"\"\"\n data = {}\n axes = f_913(data)\n self.assertEqual(axes, [])", "apis": ["pandas.DataFrame", "matplotlib.pyplot.close"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Generates histograms for each column in the given DataFrame and checks if the value distributions", "are uniform. It prints a message for each non-uniform distribution."], "note": [], "params": ["df (pd.DataFrame): The DataFrame to be analyzed."], "returns": ["List[plt.Axes]: A list of matplotlib Axes objects, each representing the histogram for a column."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> data = {'Category1': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E'],", "... 'Category2': ['X', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'W', 'W', 'W', 'W', 'W']}", ">>> axes = f_913(data)", "The distribution of values in column 'Category1' is not uniform.", "The distribution of values in column 'Category2' is not uniform.", ">>> [ax.get_title() for ax in axes]", "['Category1', 'Category2']"]}} -{"task_id": "f_419", "prompt": "from collections import Counter\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_419(df, n_clusters=3, random_state=None, n_init=10):\n \"\"\"\n Identify duplicate points in a DataFrame, perform KMeans clustering on the unique points,\n and record the clusters.\n\n Parameters:\n df (pd.DataFrame): A DataFrame containing at least two columns 'x' and 'y' representing points.\n n_clusters (int, optional): Number of clusters for KMeans clustering. Default is 3.\n random_state (int, optional): The seed used by the random number generator for reproducibility. 
Default is None.\n n_init (int, optional): Number of time the k-means algorithm will be run with different centroid seeds.\n The final results will be the best output of n_init consecutive runs in terms of\n within-cluster sum of squares. Default is 10.\n\n Returns:\n tuple: A tuple containing:\n - Counter: A Counter object with the count of duplicate points.\n - pd.DataFrame: A DataFrame with an additional column 'cluster' representing cluster assignments for unique points.\n - Axes: A scatter plot of the clustered data.\n\n Requirements:\n - collections.Counter\n - sklearn.cluster.KMeans\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({\\\n 'x': [1, 2, 2, 2, 3, 4],\\\n 'y': [1, 1, 1, 1, 3, 3]\\\n })\n >>> duplicates, df_clustered, ax = f_419(df, random_state=42)\n >>> df_clustered\n x y cluster\n 0 1 1 2\n 1 2 1 0\n 4 3 3 1\n 5 4 3 1\n >>> duplicates\n Counter({(2, 1): 3})\n \"\"\"", "canonical_solution": " # Identify duplicates\n duplicates = df[df.duplicated(subset=[\"x\", \"y\"], keep=False)]\n duplicates_counter = Counter(map(tuple, duplicates[[\"x\", \"y\"]].values))\n\n # Remove duplicates and perform KMeans clustering on unique points\n unique_df = df.drop_duplicates(subset=[\"x\", \"y\"]).copy()\n\n # Adjust n_clusters if unique data points are fewer than desired clusters\n n_clusters = min(n_clusters, len(unique_df))\n\n kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)\n unique_df[\"cluster\"] = kmeans.fit_predict(unique_df[[\"x\", \"y\"]])\n\n # Plot clustered data\n fig, ax = plt.subplots()\n scatter = ax.scatter(unique_df[\"x\"], unique_df[\"y\"], c=unique_df[\"cluster\"])\n ax.set_xlabel(\"x\")\n ax.set_ylabel(\"y\")\n ax.set_title(\"KMeans Clusters\")\n\n return duplicates_counter, unique_df, ax", "test": "import unittest\nimport pandas as pd\nfrom collections import Counter\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic functionality with 
duplicates\n df = pd.DataFrame({\"x\": [1, 2, 2, 2, 3, 4], \"y\": [1, 1, 1, 1, 3, 3]})\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(2, 1): 3}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n self.assertFalse(df_clustered[\"cluster\"].isna().any())\n def test_case_2(self):\n # Test functionality without duplicates\n df = pd.DataFrame({\"x\": [1, 2, 3, 4, 5, 6], \"y\": [1, 2, 3, 4, 5, 6]})\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter())\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n def test_case_3(self):\n # Test functionality with all points being duplicates\n df = pd.DataFrame({\"x\": [1, 1, 1, 1, 1, 1], \"y\": [1, 1, 1, 1, 1, 1]})\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(1, 1): 6}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n def test_case_4(self):\n # Test with specified number of clusters\n df = pd.DataFrame({\"x\": [1, 2, 3, 40, 50, 60], \"y\": [1, 2, 3, 40, 50, 60]})\n duplicates, df_clustered, ax = f_419(df, n_clusters=2, random_state=42)\n self.assertEqual(duplicates, Counter())\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n def test_case_5(self):\n # Test functionality with multiple duplicates\n df = pd.DataFrame(\n {\"x\": [1, 2, 3, 4, 5, 5, 5, 5], \"y\": [1, 2, 3, 4, 5, 5, 5, 5]}\n )\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(5, 5): 4}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n self.assertFalse(df_clustered[\"cluster\"].isna().any())\n def test_case_6(self):\n # Test with a mix of unique points and duplicates\n df = 
pd.DataFrame(\n {\"x\": [1, 2, 3, 3, 3, 4, 5, 6], \"y\": [1, 2, 3, 3, 3, 4, 5, 6]}\n )\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(3, 3): 3}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n self.assertFalse(df_clustered[\"cluster\"].isna().any())\n def test_case_7(self):\n # Easily separable data\n df = pd.DataFrame(\n {\n \"x\": [1, 2, 3, 10, 11, 12, 20, 21, 22],\n \"y\": [1, 2, 3, 10, 11, 12, 20, 21, 22],\n }\n )\n # We expect 3 clusters because of the natural separation in data\n duplicates, df_clustered, _ = f_419(df, n_clusters=3, random_state=42)\n self.assertEqual(duplicates, Counter())\n # Check that all points in a specific region belong to the same cluster\n cluster_1 = df_clustered[df_clustered[\"x\"] <= 3][\"cluster\"].nunique()\n cluster_2 = df_clustered[(df_clustered[\"x\"] > 3) & (df_clustered[\"x\"] <= 12)][\n \"cluster\"\n ].nunique()\n cluster_3 = df_clustered[df_clustered[\"x\"] > 12][\"cluster\"].nunique()\n self.assertEqual(\n cluster_1, 1\n ) # All points in this region should belong to the same cluster\n self.assertEqual(\n cluster_2, 1\n ) # All points in this region should belong to the same cluster\n self.assertEqual(\n cluster_3, 1\n ) # All points in this region should belong to the same cluster\n def test_case_8(self):\n # Test effects of random state on clustering outcome\n df = pd.DataFrame(\n {\"x\": [10, 20, 20, 40, 50, 60], \"y\": [10, 20, 20, 40, 50, 60]}\n )\n _, df_clustered_1, _ = f_419(df, n_clusters=2, random_state=42)\n _, df_clustered_2, _ = f_419(df, n_clusters=2, random_state=42)\n # Clusters should be the same for the same random state\n self.assertTrue((df_clustered_1[\"cluster\"] == df_clustered_2[\"cluster\"]).all())\n def tearDown(self):\n plt.close(\"all\")", "apis": ["sklearn.cluster.KMeans", "collections.Counter", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "collections", "sklearn"], 
"doc": {"description": ["Identify duplicate points in a DataFrame, perform KMeans clustering on the unique points,", "and record the clusters."], "note": [], "params": ["df (pd.DataFrame): A DataFrame containing at least two columns 'x' and 'y' representing points.", "n_clusters (int, optional): Number of clusters for KMeans clustering. Default is 3.", "random_state (int, optional): The seed used by the random number generator for reproducibility. Default is None.", "n_init (int, optional): Number of time the k-means algorithm will be run with different centroid seeds.", "The final results will be the best output of n_init consecutive runs in terms of", "within-cluster sum of squares. Default is 10."], "returns": ["tuple: A tuple containing:", "Counter: A Counter object with the count of duplicate points.", "pd.DataFrame: A DataFrame with an additional column 'cluster' representing cluster assignments for unique points.", "Axes: A scatter plot of the clustered data."], "reqs": ["collections.Counter", "sklearn.cluster.KMeans", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({\\", "'x': [1, 2, 2, 2, 3, 4],\\", "'y': [1, 1, 1, 1, 3, 3]\\", "})", ">>> duplicates, df_clustered, ax = f_419(df, random_state=42)", ">>> df_clustered", "x y cluster", "0 1 1 2", "1 2 1 0", "4 3 3 1", "5 4 3 1", ">>> duplicates", "Counter({(2, 1): 3})"]}} -{"task_id": "f_900", "prompt": "import numpy as np\nimport random\nimport matplotlib.pyplot as plt\n\n# Constants\nLETTERS = list(\"abcdefghijklmnopqrstuvwxyz\")\nNUMBERS = list(range(1, 27))\n\n\ndef f_900(n_pairs=26):\n \"\"\"\n This function generates and displays a bar chart representing random letter-number pairs.\n Each bar corresponds to a unique pair, formed by combining a letter from 'a' to 'z' with a number\n from 1 to 26. 
The function randomly shuffles these pairs and assigns a random count to each.\n\n Parameters:\n - n_pairs (int, optional): The number of letter-number pairs to display in the bar chart.\n The value must be an integer between 1 and 26, inclusive. The default value is 26, which\n includes one pair for each letter in the alphabet.\n\n Returns:\n - matplotlib.container.BarContainer: This object represents the bar chart created by the function.\n Each bar in the chart is labeled with its corresponding letter-number pair (e.g., 'a:1', 'b:2').\n\n Raises:\n - ValueError: If 'n_pairs' is outside the range of 1 to 26, inclusive. This ensures that the function\n operates within the bounds of the predefined letters ('a' to 'z') and numbers (1 to 26).\n\n Requirements:\n - numpy\n - matplotlib\n - random\n\n Notes:\n - Each call to this function will likely produce a different chart because it shuffles the order\n of the pairs and assigns random counts to them.\n - The random counts assigned to each pair range from 1 to 9.\n\n Example:\n >>> ax = f_900(5)\n >>> [bar.get_label() for bar in ax]\n ['d:4', 'b:2', 'c:3', 'e:5', 'a:1']\n \"\"\"", "canonical_solution": " if n_pairs > 26 or n_pairs < 1:\n raise ValueError(\"n_pairs should be between 1 and 26\")\n\n pairs = [f\"{letter}:{number}\" for letter, number in zip(LETTERS, NUMBERS)][:n_pairs]\n random.seed(42)\n random.shuffle(pairs)\n counts = np.random.randint(1, 10, size=n_pairs)\n\n bars = plt.bar(pairs, counts)\n\n # Set label for each bar\n for bar, pair in zip(bars, pairs):\n bar.set_label(pair)\n\n plt.xlabel(\"Letter:Number Pairs\")\n plt.ylabel(\"Counts\")\n plt.title(\"Random Letter:Number Pairs Chart\")\n\n return bars", "test": "import unittest\nimport matplotlib.pyplot as plt\nfrom matplotlib.container import BarContainer\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_900.\"\"\"\n def test_return_type(self):\n \"\"\"Verify the returned type of the function.\"\"\"\n 
random.seed(0)\n ax = f_900(5)\n self.assertIsInstance(\n ax, BarContainer, \"The returned object is not of the expected type.\"\n )\n def test_number_of_bars(self):\n \"\"\"Verify the number of bars plotted for different `n_pairs` values.\"\"\"\n random.seed(1)\n for i in [5, 10, 20]:\n ax = f_900(i)\n self.assertEqual(\n len(ax.patches),\n i,\n f\"Expected {i} bars, but got {len(ax.patches)} bars.\",\n )\n def test_labels_and_title(self):\n \"\"\"Verify the labels and the title of the plotted bar chart.\"\"\"\n random.seed(2)\n _ = f_900(15)\n fig = plt.gcf()\n axes = fig.gca()\n self.assertEqual(\n axes.get_xlabel(), \"Letter:Number Pairs\", \"X label is incorrect.\"\n )\n self.assertEqual(axes.get_ylabel(), \"Counts\", \"Y label is incorrect.\")\n self.assertEqual(\n axes.get_title(), \"Random Letter:Number Pairs Chart\", \"Title is incorrect.\"\n )\n def test_invalid_n_pairs(self):\n \"\"\"Test the function with invalid `n_pairs` values.\"\"\"\n random.seed(3)\n with self.assertRaises(ValueError):\n f_900(27)\n with self.assertRaises(ValueError):\n f_900(0)\n def test_valid_pairs(self):\n \"\"\"Verify that the pairs generated are valid and correspond to the expected letter:number format.\"\"\"\n random.seed(4)\n ax = f_900(5)\n expected_pairs = [\"a:1\", \"b:2\", \"c:3\", \"d:4\", \"e:5\"]\n generated_pairs = [bar.get_label() for bar in ax]\n for expected_pair in expected_pairs:\n self.assertIn(\n expected_pair,\n generated_pairs,\n f\"Expected pair {expected_pair} not found in plotted pairs.\",\n )", "apis": ["random.shuffle", "numpy.random.randint", "random.seed", "matplotlib.pyplot.xlabel", "numpy.random", "matplotlib.pyplot.title", "matplotlib.pyplot.bar", "matplotlib.pyplot.ylabel"], "libs": ["numpy", "matplotlib", "random"], "doc": {"description": ["This function generates and displays a bar chart representing random letter-number pairs.", "Each bar corresponds to a unique pair, formed by combining a letter from 'a' to 'z' with a number", "from 1 to 26. 
The function randomly shuffles these pairs and assigns a random count to each.", "Notes:", "- Each call to this function will likely produce a different chart because it shuffles the order", "of the pairs and assigns random counts to them.", "- The random counts assigned to each pair range from 1 to 9."], "note": [], "params": ["n_pairs (int, optional): The number of letter-number pairs to display in the bar chart.", "The value must be an integer between 1 and 26, inclusive. The default value is 26, which", "includes one pair for each letter in the alphabet."], "returns": ["matplotlib.container.BarContainer: This object represents the bar chart created by the function.", "Each bar in the chart is labeled with its corresponding letter-number pair (e.g., 'a:1', 'b:2')."], "reqs": ["numpy", "matplotlib", "random"], "raises": ["ValueError: If 'n_pairs' is outside the range of 1 to 26, inclusive. This ensures that the function", "operates within the bounds of the predefined letters ('a' to 'z') and numbers (1 to 26)."], "example": [">>> ax = f_900(5)", ">>> [bar.get_label() for bar in ax]", "['d:4', 'b:2', 'c:3', 'e:5', 'a:1']"]}} -{"task_id": "f_330", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_330(data, column=\"c\"):\n \"\"\"\n Remove a column from a data dictionary if it exists, and then plot the remaining data\n if it contains numeric data.\n\n Parameters:\n - data (dict): The input data dictionary.\n - column (str): Name of column to remove. 
Defaults to \"c\".\n\n Returns:\n - df (pd.DataFrame): The modified DataFrame after removing the specified column.\n - ax (matplotlib.axes._axes.Axes or None): The plot of the modified DataFrame if there's\n numeric data to plot, otherwise None.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}\n >>> modified_df, ax = f_330(data)\n >>> ax\n \n >>> modified_df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n if column in df.columns:\n df = df.drop(columns=column)\n\n # If there's no numeric data, return None for the plot.\n if df.empty or not np.any(df.dtypes.apply(pd.api.types.is_numeric_dtype)):\n return df, None\n\n ax = df.plot()\n return df, ax", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Scenario: DataFrame with columns 'a', 'b', and 'c'.\n np.random.seed(0)\n data = {\n \"a\": np.random.randn(10),\n \"b\": np.random.randn(10),\n \"c\": np.random.randn(10),\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = f_330(data) # Remove default column 'c'.\n # Assert column 'c' removal and plot data verification.\n self.assertNotIn(\"c\", modified_df.columns)\n plotted_data = [line.get_ydata() for line in ax.get_lines()]\n self.assertTrue(\n all(\n [\n np.array_equal(data, modified_df[col].values)\n for data, col in zip(plotted_data, modified_df.columns)\n ]\n )\n )\n def test_case_2(self):\n # Scenario: DataFrame with columns 'a' and 'b' (no 'c').\n np.random.seed(0)\n data = {\"a\": np.random.randn(10), \"b\": np.random.randn(10)}\n df = pd.DataFrame(data)\n modified_df, ax = f_330(data)\n # Assert that the modified DataFrame remains unchanged and plot is generated.\n self.assertEqual(list(df.columns), list(modified_df.columns))\n self.assertIsNotNone(ax)\n def test_case_3(self):\n # Scenario: Empty DataFrame\n data = {}\n df = 
pd.DataFrame(data)\n modified_df, ax = f_330(data)\n # Assert empty DataFrame and no plot.\n self.assertTrue(modified_df.empty)\n self.assertIsNone(ax)\n def test_case_4(self):\n # Scenario: DataFrame with single non-numeric column 'c'.\n data = {\"c\": [\"apple\", \"banana\", \"cherry\"]}\n df = pd.DataFrame(data)\n modified_df, ax = f_330(data)\n # Assert empty DataFrame after 'c' removal and no plot.\n self.assertTrue(modified_df.empty)\n self.assertIsNone(ax)\n def test_case_5(self):\n np.random.seed(0)\n # Scenario: DataFrame with columns 'a', 'b', 'c', and non-numeric column 'd'.\n data = {\n \"a\": np.random.randn(10),\n \"b\": np.random.randn(10),\n \"c\": np.random.randn(10),\n \"d\": [\n \"apple\",\n \"banana\",\n \"cherry\",\n \"date\",\n \"fig\",\n \"grape\",\n \"honeydew\",\n \"kiwi\",\n \"lime\",\n \"mango\",\n ],\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = f_330(data)\n # Assert column 'c' removal and plot data verification excluding non-numeric column 'd'.\n self.assertNotIn(\"c\", modified_df.columns)\n plotted_data = [line.get_ydata() for line in ax.get_lines()]\n self.assertTrue(\n all(\n [\n np.array_equal(data, modified_df[col].values)\n for data, col in zip(plotted_data, modified_df.columns)\n if col != \"d\"\n ]\n )\n )\n def test_case_6(self):\n # Scenario: Remove specified column.\n np.random.seed(0)\n data = {\n \"a\": np.random.randn(10),\n \"b\": np.random.randn(10),\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = f_330(df, column=\"a\")\n self.assertNotIn(\"a\", modified_df.columns)\n plotted_data = [line.get_ydata() for line in ax.get_lines()]\n self.assertTrue(\n all(\n [\n np.array_equal(data, modified_df[col].values)\n for data, col in zip(plotted_data, modified_df.columns)\n ]\n )\n )\n def test_case_7(self):\n # Scenario: Only non-numeric columns.\n data = {\n \"a\": [\"apple\", \"banana\"],\n \"b\": [\"cherry\", \"date\"],\n \"c\": [\"fig\", \"grape\"],\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = 
f_330(data)\n self.assertNotIn(\"c\", modified_df.columns)\n pd.testing.assert_frame_equal(df[[\"a\", \"b\"]], modified_df)\n self.assertEqual(ax, None)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.api", "pandas.DataFrame", "numpy.any"], "libs": ["numpy", "pandas"], "doc": {"description": ["Remove a column from a data dictionary if it exists, and then plot the remaining data", "if it contains numeric data."], "note": [], "params": ["data (dict): The input data dictionary.", "column (str): Name of column to remove. Defaults to \"c\"."], "returns": ["df (pd.DataFrame): The modified DataFrame after removing the specified column.", "ax (matplotlib.axes._axes.Axes or None): The plot of the modified DataFrame if there's", "numeric data to plot, otherwise None."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> data = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}", ">>> modified_df, ax = f_330(data)", ">>> ax", "", ">>> modified_df", "a b", "0 1 4", "1 2 5", "2 3 6"]}} -{"task_id": "f_932", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_932(data=None):\n \"\"\"\n Pre-process a dataset by converting it to a Pandas DataFrame,\n replacing values less than 0.5 with zeros, and\n standardizing the data using StandardScaler.\n\n Parameters:\n - data (numpy.ndarray, optional): A numpy array representing the dataset. If not provided, a random dataset\n of shape (100, 5) is generated.\n\n Returns:\n - pandas.DataFrame: The preprocessed dataset. 
Original values less than 0.5 are replaced with zeros, and the\n entire dataset is standardized.\n\n Requirements:\n - numpy\n - pandas\n - sklearn.preprocessing.StandardScaler\n\n Example:\n >>> np.random.seed(0)\n >>> dataset = np.random.rand(10, 5)\n >>> preprocessed_data = f_932(dataset)\n >>> preprocessed_data.head(2)\n 0 1 2 3 4\n 0 0.175481 1.062315 0.244316 -0.17039 -0.647463\n 1 0.461851 -0.978767 1.052947 1.06408 -0.647463\n \"\"\"", "canonical_solution": " if data is None:\n data = np.random.rand(100, 5)\n\n df = pd.DataFrame(data)\n df[df < 0.5] = 0\n\n scaler = StandardScaler()\n scaled_data = scaler.fit_transform(df)\n standardized_df = pd.DataFrame(scaled_data, columns=df.columns)\n\n return standardized_df", "test": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_932.\"\"\"\n def test_default_dataset(self):\n \"\"\"Test the function with default dataset.\"\"\"\n result = f_932()\n self.assertIsInstance(result, pd.DataFrame)\n self.assertEqual(result.shape, (100, 5))\n def test_small_dataset(self):\n \"\"\"Test the function with a small dataset.\"\"\"\n data = np.array([[0.1, 0.9], [0.4, 0.8]])\n result = f_932(data)\n self.assertEqual(result.shape, (2, 2))\n def test_replacement(self):\n \"\"\"Test the replacement of values less than 0.5.\"\"\"\n data = np.array([[0.1, 0.9], [0.4, 0.8]])\n result = f_932(data)\n self.assertNotIn(0.1, result.values)\n self.assertNotIn(0.4, result.values)\n def test_no_replacement(self):\n \"\"\"Test no replacement for values greater than 0.5.\"\"\"\n data = np.array([[0.6, 0.9], [0.7, 0.8]])\n result = f_932(data)\n self.assertNotIn(0.6, result.values)\n self.assertNotIn(0.7, result.values)\n self.assertNotIn(0.8, result.values)\n self.assertNotIn(0.9, result.values)\n def test_standardization(self):\n \"\"\"Test the standardization of the dataset.\"\"\"\n data = np.array([[1, 2, 3], 
[4, 5, 6], [7, 8, 9]])\n result = f_932(data)\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.isclose(result.std().mean(), 1.225, atol=0.01))\n \"\"\"Test the replacement of values less than 0.5.\"\"\"\n data = np.array([[0.1, 0.9], [0.4, 0.8]])\n result = f_932(data)\n self.assertNotIn(0.1, result.values)\n self.assertNotIn(0.4, result.values)", "apis": ["numpy.random", "numpy.random.rand", "pandas.DataFrame", "sklearn.preprocessing.StandardScaler"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Pre-process a dataset by converting it to a Pandas DataFrame,", "replacing values less than 0.5 with zeros, and", "standardizing the data using StandardScaler."], "note": [], "params": ["data (numpy.ndarray, optional): A numpy array representing the dataset. If not provided, a random dataset", "of shape (100, 5) is generated."], "returns": ["pandas.DataFrame: The preprocessed dataset. Original values less than 0.5 are replaced with zeros, and the", "entire dataset is standardized."], "reqs": ["numpy", "pandas", "sklearn.preprocessing.StandardScaler"], "raises": [], "example": [">>> np.random.seed(0)", ">>> dataset = np.random.rand(10, 5)", ">>> preprocessed_data = f_932(dataset)", ">>> preprocessed_data.head(2)", "0 1 2 3 4", "0 0.175481 1.062315 0.244316 -0.17039 -0.647463", "1 0.461851 -0.978767 1.052947 1.06408 -0.647463"]}} +{"task_id": "f_338", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom scipy.stats import chi2_contingency\n\n\ndef f_338(df1, df2, column1=\"feature1\", column2=\"feature2\"):\n \"\"\"\n Merge two dataframes based on the 'id' column, perform a chi-square independence test on the merged dataframe,\n and draw a heatmap of the contingency table created from the features in column1, column2.\n\n Parameters:\n - df1 (DataFrame): Left dataframe to merge. 
Must contain columns 'id' and one matching column1.\n - df2 (DataFrame): Right dataframe to merge from. Must contain columns 'id' and one matching column2.\n - column1 (str): Name of column containing features in df1. Defaults to 'feature1'.\n - column2 (str): Name of column containing features in df2. Defaults to 'feature2'.\n\n Returns:\n tuple: A tuple containing:\n - p (float): The p-value of the Chi-Squared test.\n - heatmap (plt.Axes): Seaborn heatmap of the contingency table.\n\n Requirements:\n - pandas\n - seaborn\n - scipy.stats.chi2_contingency\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': ['A', 'B', 'A']})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': ['X', 'Y', 'X']})\n >>> p_value, heatmap = f_338(df1, df2)\n >>> p_value\n 0.6650055421020291\n >>> heatmap\n \n \"\"\"", "canonical_solution": " df = pd.merge(df1, df2, on=\"id\")\n contingency_table = pd.crosstab(df[column1], df[column2])\n heatmap = sns.heatmap(contingency_table)\n chi2, p, dof, expected = chi2_contingency(contingency_table)\n return p, heatmap", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing basic functionality with simple data\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [\"A\", \"B\", \"A\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [\"X\", \"Y\", \"X\"]})\n p_value, heatmap = f_338(df1, df2)\n # P-value should be between 0 and 1 inclusive\n self.assertTrue(0.0 <= p_value <= 1.0)\n self.assertEqual(len(heatmap.get_yticklabels()), 2) # A and B\n self.assertEqual(len(heatmap.get_xticklabels()), 2) # X and Y\n def test_case_2(self):\n # Testing with distinct feature values across both dataframes\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"feature1\": [\"C\", \"D\", \"C\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"feature2\": [\"W\", \"W\", \"Z\"]})\n p_value, heatmap = f_338(df1, df2)\n self.assertTrue(0.0 <= p_value <= 1.0)\n 
self.assertEqual(len(heatmap.get_yticklabels()), 2) # C and D\n self.assertEqual(len(heatmap.get_xticklabels()), 2) # W and Z\n def test_case_3(self):\n # Test custom feature column names\n df1 = pd.DataFrame({\"id\": [1, 2, 3], \"foo\": [\"A\", \"B\", \"A\"]})\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"bar\": [\"X\", \"Y\", \"X\"]})\n p_value, heatmap = f_338(df1, df2, column1=\"foo\", column2=\"bar\")\n self.assertTrue(0.0 <= p_value <= 1.0)\n self.assertEqual(len(heatmap.get_yticklabels()), 2)\n self.assertEqual(len(heatmap.get_xticklabels()), 2)\n def test_case_4(self):\n # Testing a scenario where the p-value is expected to be close to 0\n # This is because there's a strong association between feature1 and feature2\n df1 = pd.DataFrame(\n {\"id\": list(range(1, 21)), \"feature1\": [\"A\"] * 10 + [\"B\"] * 10}\n )\n df2 = pd.DataFrame(\n {\"id\": list(range(1, 21)), \"feature2\": [\"X\"] * 10 + [\"Y\"] * 10}\n )\n p_value, _ = f_338(df1, df2)\n self.assertTrue(0.0 <= p_value < 0.01) # Expected p-value to be close to 0\n def test_case_5(self):\n # Test error handling - should fail when there is no 'id' column\n df1 = pd.DataFrame({\"foo\": [1, 2], \"bar\": [3, 4]})\n df2 = pd.DataFrame({\"foo\": [1, 2], \"bar\": [3, 4]})\n with self.assertRaises(KeyError):\n f_338(df1, df2)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["seaborn.heatmap", "pandas.crosstab", "pandas.merge", "scipy.stats.chi2_contingency"], "libs": ["seaborn", "pandas", "scipy"], "doc": {"description": ["Merge two dataframes based on the 'id' column, perform a chi-square independence test on the merged dataframe,", "and draw a heatmap of the contingency table created from the features in column1, column2."], "note": [], "params": ["df1 (DataFrame): Left dataframe to merge. Must contain columns 'id' and one matching column1.", "df2 (DataFrame): Right dataframe to merge from. Must contain columns 'id' and one matching column2.", "column1 (str): Name of column containing features in df1. 
Defaults to 'feature1'.", "column2 (str): Name of column containing features in df2. Defaults to 'feature2'."], "returns": ["tuple: A tuple containing:", "p (float): The p-value of the Chi-Squared test.", "heatmap (plt.Axes): Seaborn heatmap of the contingency table."], "reqs": ["pandas", "seaborn", "scipy.stats.chi2_contingency"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': ['A', 'B', 'A']})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'feature2': ['X', 'Y', 'X']})", ">>> p_value, heatmap = f_338(df1, df2)", ">>> p_value", "0.6650055421020291", ">>> heatmap", ""]}} +{"task_id": "f_871", "prompt": "import subprocess\nimport time\nimport json\nimport platform\n\nLOGFILE_PATH = \"logfile.log\"\n\n\ndef f_871(interval, duration):\n \"\"\"\n Monitors and logs CPU usage at specified intervals over a given duration.\n\n Parameters:\n interval (int): The frequency, in seconds, at which CPU usage data is captured. Must be greater than zero.\n duration (int): The total duration, in seconds, for which CPU usage is monitored. Must be greater than zero.\n\n Returns:\n str: Path to the log file where CPU usage data is saved. 
Returns None if an IOError occurs during file operations.\n\n Raises:\n ValueError: If either 'interval' or 'duration' is less than or equal to zero.\n\n Requirements:\n - subprocess\n - time\n - json\n - platform\n\n Note: \n Actual run time of the function may slightly exceed the specified 'duration' due to processing time and system response delay.\n The function records the CPU usage percentage at regular intervals for a specified duration.\n The data is captured every 'interval' seconds until the 'duration' is reached or exceeded.\n Each record includes a timestamp and the CPU usage percentage at that moment.\n The data is saved in JSON format in a log file named 'logfile.log'.\n The function supports different commands for CPU usage monitoring on Windows and Unix/Linux platforms.\n \n Example:\n >>> f_871(5, 60)\n 'logfile.log'\n \"\"\"", "canonical_solution": " if interval <= 0 or duration <= 0:\n raise ValueError(\"Interval and duration must be greater than zero.\")\n\n start_time = time.time()\n try:\n with open(LOGFILE_PATH, \"w\", encoding=\"utf-8\") as logfile:\n while time.time() - start_time <= duration:\n operation_start_time = time.time()\n\n # Check the operating system\n if platform.system() == \"Windows\":\n # Windows command for CPU usage\n command = [\n \"typeperf\",\n \"\\\\Processor(_Total)\\\\% Processor Time\",\n \"-sc\",\n \"1\",\n ]\n else:\n # Unix/Linux command for CPU usage\n command = [\"top\", \"-b\", \"-n1\"]\n\n output = subprocess.check_output(command)\n cpu_usage_line = (\n output.decode(\"utf-8\").split(\"\\n\")[2]\n if platform.system() == \"Windows\"\n else output.decode(\"utf-8\").split(\"\\n\")[2]\n )\n cpu_usage = (\n cpu_usage_line.split(\",\")[-1].strip().replace('\"', \"\")\n if platform.system() == \"Windows\"\n else cpu_usage_line.split(\":\")[1].split(\",\")[0].strip()\n )\n\n log_data = {\"timestamp\": time.time(), \"cpu_usage\": cpu_usage}\n json.dump(log_data, logfile)\n logfile.write(\"\\n\")\n\n # Adjust sleep 
time\n sleep_time = max(0, interval - (time.time() - operation_start_time))\n time.sleep(sleep_time)\n except IOError as e:\n print(f\"Error writing to file {LOGFILE_PATH}: {e}\")\n return None\n\n return LOGFILE_PATH", "test": "import unittest\nimport os\nimport json\nfrom unittest.mock import patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_871.\"\"\"\n def setUp(self):\n \"\"\"\n Setup before each test case.\n \"\"\"\n self.logfile_path = \"logfile.log\"\n def tearDown(self):\n \"\"\"\n Cleanup after each test case.\n \"\"\"\n if os.path.exists(self.logfile_path):\n os.remove(self.logfile_path)\n @patch(\"time.time\")\n def test_normal_operation(self, mock_time):\n \"\"\"\n Test the normal operation of the function.\n It should create a log file with the expected content.\n \"\"\"\n # Create an iterator that starts at 0 and increments by 5 every time it's called\n time_iter = iter(range(0, 100, 5))\n mock_time.side_effect = lambda: next(time_iter)\n result = f_871(5, 25)\n self.assertEqual(result, self.logfile_path)\n self.assertTrue(os.path.exists(self.logfile_path))\n def test_invalid_interval(self):\n \"\"\"\n Test the function with an invalid interval value (less than or equal to zero).\n It should raise a ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_871(-1, 10)\n def test_invalid_duration(self):\n \"\"\"\n Test the function with an invalid duration value (less than or equal to zero).\n It should raise a ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_871(5, -10)\n @patch(\"subprocess.check_output\")\n @patch(\"time.time\")\n @patch(\"platform.system\")\n def test_subprocess_output_handling_windows(\n self, mock_platform, mock_time, mock_subprocess\n ):\n \"\"\"\n Test handling of subprocess output on Windows.\n It should correctly parse the CPU usage from the subprocess output.\n \"\"\"\n mock_platform.return_value = \"Windows\"\n mock_time.side_effect = iter(range(0, 100, 5))\n mock_output = 
b'\"\\\\Processor(_Total)\\\\% Processor Time\",\"5.0\"\\n\\n\"2023-04-01 12:34:56.789\",\"5.0\"\\n'\n mock_subprocess.return_value = mock_output\n result = f_871(5, 10)\n self.assertEqual(result, self.logfile_path)\n @patch(\"subprocess.check_output\")\n @patch(\"time.time\")\n @patch(\"platform.system\")\n def test_subprocess_output_handling_linux(\n self, mock_platform, mock_time, mock_subprocess\n ):\n \"\"\"\n Test handling of subprocess output on Linux.\n It should correctly parse the CPU usage from the subprocess output.\n \"\"\"\n mock_platform.return_value = \"Linux\"\n mock_time.side_effect = iter(range(0, 100, 5))\n mock_output = b\"Linux 4.15.0-54-generic (ubuntu) \\nTasks: 195 total...\\n%Cpu(s): 5.0 us, 2.0 sy, 0.0 ni, 92.0 id, 0.0 wa, 0.0 hi, 1.0 si, 0.0 st\\n\"\n mock_subprocess.return_value = mock_output\n result = f_871(5, 10)\n self.assertEqual(result, self.logfile_path)\n @patch(\"builtins.open\", side_effect=IOError(\"Mocked error\"))\n def test_io_error_handling(self, mock_open):\n \"\"\"\n Test the function's behavior when an IOError occurs during file operations.\n It should handle the error and return None.\n \"\"\"\n result = f_871(5, 10)\n self.assertIsNone(result)", "apis": ["platform.system", "subprocess.check_output", "json.dump", "time.sleep", "time.time"], "libs": ["platform", "time", "json", "subprocess"], "doc": {"description": ["Monitors and logs CPU usage at specified intervals over a given duration."], "note": ["Actual run time of the function may slightly exceed the specified 'duration' due to processing time and system response delay.", "The function records the CPU usage percentage at regular intervals for a specified duration.", "The data is captured every 'interval' seconds until the 'duration' is reached or exceeded.", "Each record includes a timestamp and the CPU usage percentage at that moment.", "The data is saved in JSON format in a log file named 'logfile.log'.", "The function supports different commands for CPU usage 
monitoring on Windows and Unix/Linux platforms."], "params": ["interval (int): The frequency, in seconds, at which CPU usage data is captured. Must be greater than zero.", "duration (int): The total duration, in seconds, for which CPU usage is monitored. Must be greater than zero."], "returns": ["str: Path to the log file where CPU usage data is saved. Returns None if an IOError occurs during file operations."], "reqs": ["subprocess", "time", "json", "platform"], "raises": ["ValueError: If either 'interval' or 'duration' is less than or equal to zero."], "example": [">>> f_871(5, 60)", "'logfile.log'"]}} +{"task_id": "f_390", "prompt": "from datetime import datetime\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_390(\n epoch_milliseconds,\n teams=[\"Team1\", \"Team2\", \"Team3\", \"Team4\", \"Team5\"],\n random_seed=0,\n):\n \"\"\"\n Generate and plot a performance trend for different teams from a given epoch timestamp to the current time.\n\n The performance data is generated by creating a series of random values for each day from the starting timestamp\n to the present day. Each team's performance is simulated as a random float between 0.1 and 1 for each day.\n The plot shows days since the start date on the x-axis and performance on the y-axis.\n\n Parameters:\n epoch_milliseconds (int): The epoch milliseconds from where to start the generation. Must not be in the future.\n teams (list of str, optional): Team names. If not provided, defaults to ['Team1', 'Team2', 'Team3', 'Team4', 'Team5'].\n random_seed (int, optional): Seed for random number generation to ensure reproducibility. 
Defaults to 0.\n\n Returns:\n dict: A dictionary containing performance data for each team, with days as indices and performance as float values.\n matplotlib.figure.Figure: A figure object showing the performance trend of each team over the days.\n\n Requirements:\n - datetime.datetime\n - random\n - matplotlib\n\n Example:\n >>> results, ax = f_390(1236472051807)\n >>> results.keys()\n dict_keys(['Team1', 'Team2', 'Team3', 'Team4', 'Team5'])\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n random.seed(random_seed)\n\n if (not isinstance(teams, list)) or (not all(isinstance(t, str) for t in teams)):\n raise TypeError(\"Expected teams to be list of str\")\n\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n current_time = datetime.now()\n days_diff = (current_time - start_time).days\n\n if days_diff < 0:\n raise ValueError(\"Input epoch timestamp is in the future!\")\n\n performance_data = {team: [0] * days_diff for team in teams}\n\n for i in range(days_diff):\n for team in teams:\n performance = random.uniform(0.1, 1)\n performance_data[team][i] += performance\n\n fig, ax = plt.subplots()\n for team, performance in performance_data.items():\n ax.plot(range(days_diff), performance, label=team)\n\n ax.set_xlabel(\"Days since \" + start_time.strftime(\"%Y-%m-%d %H:%M:%S\"))\n ax.set_ylabel(\"Performance\")\n ax.legend()\n\n return performance_data, fig", "test": "import unittest\nfrom datetime import datetime, timedelta\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.x = 1631295600000\n self.default_valid_teams = [\"Team1\", \"Team2\", \"Team3\", \"Team4\", \"Team5\"]\n def _check_valid_performance_data(self, performance_data, valid_teams):\n self.assertIsInstance(performance_data, dict)\n self.assertTrue(all(team in valid_teams for team in performance_data.keys()))\n for team, performances in performance_data.items():\n for performance in performances:\n self.assertTrue(\n 0.1 <= performance <= 
1, f\"Performance out of range for {team}\"\n )\n self.assertIsInstance(performance, float)\n def _check_plot(self, fig):\n ax = fig.axes[0]\n self.assertIsInstance(fig, plt.Figure)\n self.assertEqual(ax.get_ylabel(), \"Performance\")\n self.assertTrue(ax.get_xlabel().startswith(\"Days since\"))\n def test_case_1(self):\n # Test basic case with default parameters - data\n performance_data, _ = f_390(self.x)\n self._check_valid_performance_data(performance_data, self.default_valid_teams)\n def test_case_2(self):\n # Test basic case with default parameters - plot\n _, fig = f_390(self.x)\n self._check_plot(fig)\n def test_case_3(self):\n # Test basic case with custom input\n performance_data, fig = f_390(1236472051807, random_seed=42)\n self._check_plot(fig)\n self._check_valid_performance_data(performance_data, self.default_valid_teams)\n def test_case_4(self):\n # Test custom parameters - custom teams\n for custom_teams in [[\"A\", \"B\"], [\"c d e\", \"F\", \"GH\", \"ij kl\"]]:\n performance_data, fig = f_390(self.x, teams=custom_teams, random_seed=42)\n self._check_plot(fig)\n self._check_valid_performance_data(performance_data, custom_teams)\n def test_case_5(self):\n # Test custom parameters - random seed\n performance_data1, _ = f_390(self.x, random_seed=42)\n performance_data2, _ = f_390(self.x, random_seed=42)\n performance_data3, _ = f_390(self.x, random_seed=0)\n self.assertEqual(performance_data1, performance_data2)\n self.assertNotEqual(performance_data1, performance_data3)\n def test_case_6(self):\n # Test error handling for invalid input time\n future_epoch = int((datetime.now() + timedelta(days=1)).timestamp() * 1000)\n with self.assertRaises(ValueError):\n f_390(future_epoch)\n def test_case_7(self):\n # Test error handling for invalid team\n with self.assertRaises(TypeError):\n f_390(self.x, [1, 2, 3])\n with self.assertRaises(TypeError):\n f_390(self.x, [[]])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["datetime.datetime.fromtimestamp", 
"datetime.datetime.now", "random.seed", "matplotlib.pyplot.subplots", "random.uniform"], "libs": ["random", "matplotlib", "datetime"], "doc": {"description": ["Generate and plot a performance trend for different teams from a given epoch timestamp to the current time.", "The performance data is generated by creating a series of random values for each day from the starting timestamp", "to the present day. Each team's performance is simulated as a random float between 0.1 and 1 for each day.", "The plot shows days since the start date on the x-axis and performance on the y-axis."], "note": [], "params": ["epoch_milliseconds (int): The epoch milliseconds from where to start the generation. Must not be in the future.", "teams (list of str, optional): Team names. If not provided, defaults to ['Team1', 'Team2', 'Team3', 'Team4', 'Team5'].", "random_seed (int, optional): Seed for random number generation to ensure reproducibility. Defaults to 0."], "returns": ["dict: A dictionary containing performance data for each team, with days as indices and performance as float values.", "matplotlib.figure.Figure: A figure object showing the performance trend of each team over the days."], "reqs": ["datetime.datetime", "random", "matplotlib"], "raises": [], "example": [">>> results, ax = f_390(1236472051807)", ">>> results.keys()", "dict_keys(['Team1', 'Team2', 'Team3', 'Team4', 'Team5'])", ">>> type(ax)", ""]}} +{"task_id": "f_759", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib.axes import Axes\nfrom statsmodels.tsa.arima.model import ARIMA\nfrom typing import List, Tuple\n\ndef f_759(df: pd.DataFrame) -> Tuple[List[float], Axes]:\n \"\"\"\n Forecasts the share closing prices for the next 7 days using the ARIMA model and plots the forecast.\n\n Parameters:\n df (pd.DataFrame): The input dataframe with columns 'date' and 'closing_price'. 
\n 'date' should be of datetime dtype and 'closing_price' should be float.\n\n Returns:\n Tuple[List[float], Axes]: A tuple containing:\n - A list with forecasted prices for the next 7 days.\n - A matplotlib Axes object containing the plot.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - statsmodels.tsa.arima.model.ARIMA\n\n Example:\n >>> df = pd.DataFrame({\n ... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),\n ... 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n ... })\n >>> forecast, ax = f_759(df)\n >>> print(forecast)\n [106.99999813460752, 107.99999998338443, 108.99999547091295, 109.99999867405204, 110.99999292499156, 111.99999573455818, 112.9999903188028]\n \"\"\"", "canonical_solution": " # Creating the ARIMA model\n model = ARIMA(df['closing_price'], order=(5, 1, 0))\n model_fit = model.fit()\n \n # Forecasting the next 7 days\n forecast = model_fit.forecast(steps=7)\n # Plotting the forecast\n fig, ax = plt.subplots()\n ax.plot(df['date'], df['closing_price'], label='Historical Closing Prices')\n forecast_dates = pd.date_range(start=df['date'].iloc[-1] + pd.Timedelta(days=1), periods=7)\n ax.plot(forecast_dates, forecast, label='Forecasted Closing Prices')\n ax.legend()\n \n return forecast.tolist(), ax", "test": "# Importing required modules for testing\nimport unittest\nimport pandas as pd\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n # Creating a sample dataframe with closing prices for 7 days\n df1 = pd.DataFrame({\n 'date': pd.date_range(start='2022-01-01', end='2022-01-07', freq='D'),\n 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n })\n \n # Running the function\n forecast1, ax1 = f_759(df1)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast1, list)\n self.assertIsInstance(ax1, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast1, [106.99999813460752, 107.99999998338443, 
108.99999547091295, 109.99999867405204, 110.99999292499156, 111.99999573455818, 112.9999903188028]):\n self.assertAlmostEqual(a, b, places=3)\n \n # Checking if the plot contains data\n lines = ax1.get_lines()\n self.assertTrue(lines[0].get_ydata().tolist(), [100, 101, 102, 103, 104, 105, 106])\n def test_case_2(self):\n # Creating a sample dataframe with closing prices for 7 days\n df2 = pd.DataFrame({\n 'date': pd.date_range(start='2022-02-01', end='2022-02-07', freq='D'),\n 'closing_price': [200, 201, 202, 203, 204, 205, 206]\n })\n \n # Running the function\n forecast2, ax2 = f_759(df2)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast2, list)\n self.assertIsInstance(ax2, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast2, [206.9999997816766, 208.00000005262595, 208.99999941300158, 210.000000028273, 210.99999903094576, 211.99999982088116, 212.99999869216418]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax2.get_lines()\n self.assertAlmostEqual(lines[0].get_ydata().tolist(), [200, 201, 202, 203, 204, 205, 206])\n def test_case_3(self):\n # Creating a sample dataframe with closing prices for 7 days\n df3 = pd.DataFrame({\n 'date': pd.date_range(start='2022-03-01', end='2022-03-07', freq='D'),\n 'closing_price': [300, 301, 302, 303, 304, 305, 306]\n })\n \n # Running the function\n forecast3, ax3 = f_759(df3)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast3, list)\n self.assertIsInstance(ax3, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast3, [306.99999853839176, 308.00000003237324, 308.9999964108992, 309.9999991004857, 310.9999943724899, 311.9999968807911, 312.99999233933994]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax3.get_lines()\n # get data from the line\n self.assertAlmostEqual(lines[0].get_ydata().tolist(), [300, 
301, 302, 303, 304, 305, 306])\n def test_case_4(self):\n # Creating a sample dataframe with closing prices for 7 days\n df4 = pd.DataFrame({\n 'date': pd.date_range(start='2022-04-01', end='2022-04-07', freq='D'),\n 'closing_price': [400, 401, 402, 403, 404, 405, 406]\n })\n \n # Running the function\n forecast4, ax4 = f_759(df4)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast4, list)\n self.assertIsInstance(ax4, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast4, [406.99999936259456, 408.0000000781549, 408.99999837145054, 409.9999998156926, 410.9999973988557, 411.99999898892963, 412.9999964967954]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax4.get_lines()\n self.assertAlmostEqual(lines[0].get_ydata().tolist(), [400, 401, 402, 403, 404, 405, 406])\n def test_case_5(self):\n # Creating a sample dataframe with closing prices for 7 days\n df5 = pd.DataFrame({\n 'date': pd.date_range(start='2022-05-01', end='2022-05-07', freq='D'),\n 'closing_price': [500, 501, 502, 503, 504, 505, 506]\n })\n \n # Running the function\n forecast5, ax5 = f_759(df5)\n \n # Checking the type of the forecast and plot object\n self.assertIsInstance(forecast5, list)\n self.assertIsInstance(ax5, Axes)\n \n # Checking the length of the forecasted list\n for a, b in zip(forecast5, [506.99999853029163, 508.0000000310427, 508.99999639197796, 509.9999990913683, 510.9999943427388, 511.9999968573493, 512.9999922971087]):\n self.assertAlmostEqual(a, b, places=3)\n # Checking if the plot contains data\n lines = ax5.get_lines()\n self.assertTrue(lines[0].get_ydata().tolist(), [500, 501, 502, 503, 504, 505, 506])", "apis": ["pandas.Timedelta", "pandas.DataFrame", "pandas.date_range", "matplotlib.pyplot.subplots", "statsmodels.tsa.arima.model.ARIMA"], "libs": ["statsmodels", "pandas", "matplotlib"], "doc": {"description": ["Forecasts the share closing prices for the next 7 days 
using the ARIMA model and plots the forecast."], "note": [], "params": ["df (pd.DataFrame): The input dataframe with columns 'date' and 'closing_price'.", "'date' should be of datetime dtype and 'closing_price' should be float."], "returns": ["Tuple[List[float], Axes]: A tuple containing:", "A list with forecasted prices for the next 7 days.", "A matplotlib Axes object containing the plot."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "statsmodels.tsa.arima.model.ARIMA"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),", "... 'closing_price': [100, 101, 102, 103, 104, 105, 106]", "... })", ">>> forecast, ax = f_759(df)", ">>> print(forecast)", "[106.99999813460752, 107.99999998338443, 108.99999547091295, 109.99999867405204, 110.99999292499156, 111.99999573455818, 112.9999903188028]"]}} +{"task_id": "f_333", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\n\n\ndef f_333(df, target_column, column_to_remove=\"c\", test_size=0.2):\n \"\"\"\n Split the data into train and test datasets after removing a specified column if it exists.\n\n Parameters:\n - df (dict): The input dataframe.\n - target_column (str): The name of the target column.\n - column_to_remove (str): The name of the column to remove. Defaults to 'c'.\n - test_size (float): The ratio of test data in split output. Defaults to .2.\n\n Returns:\n - X_train (pd.DataFrame): Split features for training.\n - X_test (pd.DataFrame): Split features for testing.\n - y_train (pd.Series): Split target values for training.\n - y_test (pd.Series): Split target values for testing.\n\n Requirements:\n - pandas\n - sklearn\n\n Examples:\n >>> data = {\n ... 'a': [1, 2, 3, 4],\n ... 'b': [5, 6, 7, 8],\n ... 'c': [9, 10, 11, 12],\n ... 'target': [0, 1, 0, 1]\n ... }\n >>> X_train, _, _, _ = f_333(data, 'target')\n >>> type(X_train), X_train.shape\n (, (3, 2))\n >>> data = {\n ... 'x1': [10, 20, 30, 40],\n ... 
'x2': [50, 60, 70, 80],\n ... 'x3': [90, 100, 110, 120],\n ... 'outcome': [1, 2, 3, 4]\n ... }\n >>> df2 = pd.DataFrame(data)\n >>> _, _, _, y_test = f_333(df2, 'outcome', 'x3', .25)\n >>> type(y_test), y_test.shape\n (, (1,))\n \"\"\"", "canonical_solution": " df = pd.DataFrame(df)\n # Drop the specified column if it exists in the dataframe\n if column_to_remove in df.columns:\n df = df.drop(columns=column_to_remove)\n\n # Split the dataframe into training and test datasets\n X_train, X_test, y_train, y_test = train_test_split(\n df.drop(columns=target_column), df[target_column], test_size=test_size\n )\n\n return X_train, X_test, y_train, y_test", "test": "import unittest\nimport pandas as pd\nfrom sklearn.utils._param_validation import InvalidParameterError\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # basic test dataframe\n self.df = {\"a\": [1, 2, 3, 4, 5], \"b\": [4, 5, 6, 7, 8], \"c\": [7, 8, 9, 10, 11]}\n def shape_testing_helper(self, expected_train_len, expected_test_len, split_data):\n X_train, X_test, y_train, y_test = split_data\n self.assertTrue(len(X_train) == expected_train_len)\n self.assertTrue(len(y_train) == expected_train_len)\n self.assertTrue(len(X_test) == expected_test_len)\n self.assertTrue(len(y_test) == expected_test_len)\n def test_case_1(self):\n # Dataframe with a 'c' column to be removed\n X_train, X_test, y_train, y_test = f_333(self.df, \"b\")\n self.assertEqual(\"a\", X_train.columns[0])\n self.assertEqual(\"b\", y_train.name)\n self.assertNotIn(\"c\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))\n def test_case_2(self):\n # Specify removal of separate column\n X_train, X_test, y_train, y_test = f_333(self.df, \"a\", column_to_remove=\"b\")\n self.assertEqual(\"c\", X_train.columns[0])\n self.assertEqual(\"a\", y_train.name)\n self.assertNotIn(\"b\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))\n def test_case_3(self):\n # Dataframe 
doesn't have column to be removed\n X_train, X_test, y_train, y_test = f_333(self.df, \"a\", column_to_remove=\"FOO\")\n self.assertEqual(\"a\", y_train.name)\n self.assertIn(\"b\", X_train.columns)\n self.assertIn(\"c\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))\n def test_case_4(self):\n # Change testing ratio\n X_train, X_test, y_train, y_test = f_333(self.df, \"a\", test_size=0.8)\n self.shape_testing_helper(1, 4, (X_train, X_test, y_train, y_test))\n def test_case_5(self):\n # Should fail if specify invalid ratio\n with self.assertRaises(InvalidParameterError):\n f_333(self.df, \"a\", test_size=-999)\n with self.assertRaises(InvalidParameterError):\n f_333(self.df, \"a\", test_size=\"foo\")\n def test_case_6(self):\n # Testing with a dataframe having mixed data types\n df = {\n \"a\": [pd.NA, 2.3, 3.4, 4.5, 5.5],\n \"b\": [\"one\", \"two\", pd.NA, \"four\", \"five\"],\n \"c\": [True, False, True, False, pd.NA],\n }\n X_train, X_test, y_train, y_test = f_333(df, \"b\")\n self.assertNotIn(\"c\", X_train.columns)\n self.shape_testing_helper(4, 1, (X_train, X_test, y_train, y_test))", "apis": ["pandas.DataFrame", "sklearn.model_selection.train_test_split"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Split the data into train and test datasets after removing a specified column if it exists."], "note": [], "params": ["df (dict): The input dataframe.", "target_column (str): The name of the target column.", "column_to_remove (str): The name of the column to remove. Defaults to 'c'.", "test_size (float): The ratio of test data in split output. Defaults to .2."], "returns": ["X_train (pd.DataFrame): Split features for training.", "X_test (pd.DataFrame): Split features for testing.", "y_train (pd.Series): Split target values for training.", "y_test (pd.Series): Split target values for testing."], "reqs": ["pandas", "sklearn"], "raises": [], "example": ["Examples:", ">>> data = {", "... 'a': [1, 2, 3, 4],", "... 
'b': [5, 6, 7, 8],", "... 'c': [9, 10, 11, 12],", "... 'target': [0, 1, 0, 1]", "... }", ">>> X_train, _, _, _ = f_333(data, 'target')", ">>> type(X_train), X_train.shape", "(, (3, 2))", ">>> data = {", "... 'x1': [10, 20, 30, 40],", "... 'x2': [50, 60, 70, 80],", "... 'x3': [90, 100, 110, 120],", "... 'outcome': [1, 2, 3, 4]", "... }", ">>> df2 = pd.DataFrame(data)", ">>> _, _, _, y_test = f_333(df2, 'outcome', 'x3', .25)", ">>> type(y_test), y_test.shape", "(, (1,))"]}} +{"task_id": "f_756", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\n\ndef f_756(df):\n \"\"\"\n Predicts the stock closing prices for the next 7 days using simple linear regression and plots the data.\n\n Parameters:\n df (DataFrame): The input dataframe with columns 'date' and 'closing_price'. 'date' should be in datetime format.\n\n Returns:\n tuple: A tuple containing:\n - list: A list with predicted prices for the next 7 days.\n - Axes: The matplotlib Axes object containing the plot.\n \n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - sklearn.linear_model.LinearRegression\n\n Constants:\n - The function uses a constant time step of 24*60*60 seconds to generate future timestamps.\n\n Example:\n >>> df = pd.DataFrame({\n ... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),\n ... 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n ... 
})\n >>> pred_prices, plot = f_756(df)\n >>> print(pred_prices)\n [107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0]\n \"\"\"", "canonical_solution": " # Convert date to timestamp\n df['date'] = pd.to_datetime(df['date'])\n df['date'] = df['date'].map(pd.Timestamp.timestamp)\n \n # Prepare data\n X = df['date'].values.reshape(-1, 1)\n y = df['closing_price'].values\n \n # Fit model\n model = LinearRegression()\n model.fit(X, y)\n \n # Predict future prices\n future_dates = np.array([df['date'].max() + i*24*60*60 for i in range(1, 8)]).reshape(-1, 1)\n pred_prices = model.predict(future_dates)\n \n # Plot\n fig, ax = plt.subplots()\n ax.scatter(df['date'], df['closing_price'], color='black')\n ax.plot(future_dates, pred_prices, color='blue', linewidth=3)\n \n return pred_prices.tolist(), ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),\n 'closing_price': [100, 101, 102, 103, 104, 105, 106]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_2(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='2/1/2021', end='2/7/2021'),\n 'closing_price': [200, 201, 202, 203, 204, 205, 206]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [207.0, 208.0, 209.0, 210.0, 211.0, 212.0, 213.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_3(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='3/1/2021', end='3/7/2021'),\n 'closing_price': [300, 301, 302, 303, 304, 305, 306]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [307.0, 308.0, 309.0, 310.0, 311.0, 312.0, 313.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_4(self):\n df = 
pd.DataFrame({\n 'date': pd.date_range(start='4/1/2021', end='4/7/2021'),\n 'closing_price': [400, 401, 402, 403, 404, 405, 406]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [407.0, 408.0, 409.0, 410.0, 411.0, 412.0, 413.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')\n def test_case_5(self):\n df = pd.DataFrame({\n 'date': pd.date_range(start='5/1/2021', end='5/7/2021'),\n 'closing_price': [500, 501, 502, 503, 504, 505, 506]\n })\n pred_prices, ax = f_756(df)\n self.assertEqual(pred_prices, [507.0, 508.0, 509.0, 510.0, 511.0, 512.0, 513.0])\n self.assertEqual(ax.get_xlabel(), '')\n self.assertEqual(ax.get_ylabel(), '')", "apis": ["pandas.to_datetime", "numpy.array", "pandas.Timestamp", "matplotlib.pyplot.subplots", "sklearn.linear_model.LinearRegression"], "libs": ["sklearn", "numpy", "pandas", "matplotlib"], "doc": {"description": ["Predicts the stock closing prices for the next 7 days using simple linear regression and plots the data.", "Constants:", "- The function uses a constant time step of 24*60*60 seconds to generate future timestamps."], "note": [], "params": ["df (DataFrame): The input dataframe with columns 'date' and 'closing_price'. 'date' should be in datetime format."], "returns": ["tuple: A tuple containing:", "list: A list with predicted prices for the next 7 days.", "Axes: The matplotlib Axes object containing the plot."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "sklearn.linear_model.LinearRegression"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 'date': pd.date_range(start='1/1/2021', end='1/7/2021'),", "... 'closing_price': [100, 101, 102, 103, 104, 105, 106]", "... 
})", ">>> pred_prices, plot = f_756(df)", ">>> print(pred_prices)", "[107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0]"]}} +{"task_id": "f_860", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report\n\n\ndef f_860(csv_file_path, target_column=\"target\", test_size=0.2, n_estimators=100):\n \"\"\"\n Processes a CSV file to train a Random Forest classifier and generates a formatted classification report.\n\n Parameters:\n csv_file_path (str): The path to the CSV file containing the data.\n target_column (str, optional): The name of the target variable column. Defaults to 'target'.\n test_size (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.2.\n n_estimators (int, optional): The number of trees in the RandomForestClassifier. Defaults to 100.\n\n Returns:\n str: A formatted classification report. The report includes metrics such as precision, recall,\n f1-score for each class, as well as overall accuracy, macro average, and weighted average.\n\n Raises:\n ValueError: If the specified target_column is not found in the CSV file.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> report = f_860('/path/to/data.csv')\n >>> print(report)\n class 0 0.88 0.90 0.89 50\n class 1 0.89 0.87 0.88 48\n ...\n accuracy 0.89 100\n macro avg 0.88 0.89 0.88 100\n weighted avg 0.89 0.89 0.89 100\n\n Note:\n The CSV file must have a column with the name specified by 'target_column', and it should be in a\n format readable by pandas.read_csv().\n \"\"\"", "canonical_solution": " df = pd.read_csv(csv_file_path)\n if target_column not in df.columns:\n raise ValueError(f\"'{target_column}' column not found in the CSV file.\")\n\n X = df.drop(target_column, axis=1)\n y = df[target_column]\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=test_size, random_state=42\n )\n clf = 
RandomForestClassifier(n_estimators=n_estimators, random_state=42)\n clf.fit(X_train, y_train)\n y_pred = clf.predict(X_test)\n report = classification_report(y_test, y_pred)\n\n # New formatting approach\n lines = report.split(\"\\n\")\n formatted_lines = []\n for line in lines:\n # Split the line into words and rejoin with specific spacing\n parts = line.split()\n if len(parts) == 5: # Class-specific metrics\n formatted_line = f\"{parts[0]:<15}{parts[1]:>10}{parts[2]:>10}{parts[3]:>10}{parts[4]:>10}\"\n elif len(parts) == 4: # Overall metrics\n formatted_line = f\"{parts[0]:<15}{parts[1]:>10}{parts[2]:>10}{parts[3]:>10}\"\n else:\n formatted_line = line # Header or empty lines\n formatted_lines.append(formatted_line)\n\n formatted_report = \"\\n\".join(formatted_lines)\n return formatted_report", "test": "import unittest\nfrom unittest.mock import patch\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_860.\"\"\"\n @patch(\"pandas.read_csv\")\n def test_default_parameters(self, mock_read_csv):\n \"\"\"\n Test f_860 with default parameters using an adequately sized mock dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"target\": [0, 1] * 50, # Alternating 0s and 1s\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\")\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_non_default_target_column(self, mock_read_csv):\n \"\"\"\n Test f_860 with a non-default target column using a larger mock dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"label\": [1, 0] * 50, # Alternating 1s and 0s\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\", target_column=\"label\")\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_different_test_size(self, mock_read_csv):\n \"\"\"\n Test f_860 with a different test 
size and a larger dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"target\": [0, 1, 1, 0] * 25, # Repeated pattern\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\", test_size=0.5)\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_different_n_estimators(self, mock_read_csv):\n \"\"\"\n Test f_860 with a different number of estimators and an expanded dataset.\n \"\"\"\n mock_data = {\n \"feature1\": range(100),\n \"feature2\": range(100, 200),\n \"target\": [1, 0] * 50, # Alternating 1s and 0s\n }\n mock_read_csv.return_value = pd.DataFrame(mock_data)\n result = f_860(\"dummy_path.csv\", n_estimators=50)\n self.assertIn(\"precision\", result)\n @patch(\"pandas.read_csv\")\n def test_missing_target_column(self, mock_read_csv):\n \"\"\"\n Test f_860 with a missing target column.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame(\n {\"feature1\": [1, 2], \"feature2\": [3, 4]}\n )\n with self.assertRaises(ValueError):\n f_860(\"dummy_path.csv\", target_column=\"not_exist\")", "apis": ["pandas.read_csv", "sklearn.model_selection.train_test_split", "sklearn.metrics.classification_report", "sklearn.ensemble.RandomForestClassifier"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Processes a CSV file to train a Random Forest classifier and generates a formatted classification report."], "note": ["The CSV file must have a column with the name specified by 'target_column', and it should be in a", "format readable by pandas.read_csv()."], "params": ["csv_file_path (str): The path to the CSV file containing the data.", "target_column (str, optional): The name of the target variable column. Defaults to 'target'.", "test_size (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.2.", "n_estimators (int, optional): The number of trees in the RandomForestClassifier. 
Defaults to 100."], "returns": ["str: A formatted classification report. The report includes metrics such as precision, recall,", "f1-score for each class, as well as overall accuracy, macro average, and weighted average."], "reqs": ["pandas", "sklearn"], "raises": ["ValueError: If the specified target_column is not found in the CSV file."], "example": [">>> report = f_860('/path/to/data.csv')", ">>> print(report)", "class 0 0.88 0.90 0.89 50", "class 1 0.89 0.87 0.88 48", "...", "accuracy 0.89 100", "macro avg 0.88 0.89 0.88 100", "weighted avg 0.89 0.89 0.89 100"]}} +{"task_id": "f_832", "prompt": "import random\nimport string\n\n\ndef f_832(length: int, predicates: list, seed: int = None):\n \"\"\"\n Generates a random string of specified length and evaluates it for specific characteristics.\n\n Parameters:\n - length (int): Desired length of the generated string.\n - predicates (list of strings): Conditions to evaluate the string.\n Must contain options from 'has_uppercase', 'has_lowercase', 'has_special_chars', 'has_numbers'.\n - seed (int, optional): Seed for the random number generator for reproducibility.\n\n Returns:\n - tuple:\n - string: the generated random text\n - dict: the text's characteristics\n\n Raises:\n - ValueError: If the specified length is negative.\n - KeyError: If any predicate is not recognized.\n\n Notes:\n - Predicates are deduplicated.\n - Characters are randomly sampled from string ascii_letters, digits, and punctuation with replacement.\n - Any invalid predicates provided will result in a KeyError.\n - If no predicates are provided, the result dictionary will be empty.\n\n Requirements:\n - string\n - random\n\n Example:\n >>> f_832(10, ['has_uppercase', 'has_numbers'], seed=42)\n ('8czu(\"@iNc', {'has_uppercase': True, 'has_numbers': True})\n >>> f_832(5, ['has_lowercase'], seed=123)\n ('eiMk[', {'has_lowercase': True})\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n if length < 0:\n raise 
ValueError(\"Length must be non-negative.\")\n\n predicate_functions = {\n \"has_uppercase\": lambda x: any(c.isupper() for c in x),\n \"has_lowercase\": lambda x: any(c.islower() for c in x),\n \"has_special_chars\": lambda x: any(c in string.punctuation for c in x),\n \"has_numbers\": lambda x: any(c.isdigit() for c in x),\n }\n\n predicates = list(set(predicates))\n if any(p not in predicate_functions for p in predicates):\n raise KeyError(f\"Invalid predicate provided.\")\n\n characters = string.ascii_letters + string.digits + string.punctuation\n generated_string = \"\".join(random.choices(characters, k=length))\n\n results = {\n predicate: predicate_functions[predicate](generated_string)\n for predicate in predicates\n }\n\n return generated_string, results", "test": "import unittest\nimport string\nclass TestCases(unittest.TestCase):\n def test_valid_length_and_predicates(self):\n result_str, result_dict = f_832(\n 10,\n [\"has_uppercase\", \"has_lowercase\", \"has_numbers\", \"has_special_chars\"],\n seed=1,\n )\n self.assertEqual(len(result_str), 10)\n self.assertTrue(result_dict[\"has_uppercase\"])\n self.assertTrue(result_dict[\"has_lowercase\"])\n self.assertTrue(result_dict[\"has_numbers\"])\n self.assertTrue(result_dict[\"has_special_chars\"])\n def test_result_correctness(self):\n n_repetitions = 1000\n for _ in range(n_repetitions):\n result_str, result_dict = f_832(\n 10,\n [\"has_uppercase\", \"has_lowercase\", \"has_numbers\", \"has_special_chars\"],\n seed=1,\n )\n if any(c.isupper() for c in result_str):\n self.assertTrue(result_dict[\"has_uppercase\"])\n if any(c.islower() for c in result_str):\n self.assertTrue(result_dict[\"has_lowercase\"])\n if any(c in string.punctuation for c in result_str):\n self.assertTrue(result_dict[\"has_special_chars\"])\n if any(c.isdigit() for c in result_str):\n self.assertTrue(result_dict[\"has_numbers\"])\n def test_empty_string(self):\n result_str, result_dict = f_832(0, [\"has_uppercase\", \"has_numbers\"], 
seed=3)\n self.assertEqual(result_str, \"\")\n self.assertFalse(result_dict[\"has_uppercase\"])\n self.assertFalse(result_dict[\"has_numbers\"])\n def test_negative_length(self):\n with self.assertRaises(ValueError):\n f_832(-1, [\"has_uppercase\"])\n def test_no_predicates(self):\n result_str, result_dict = f_832(10, [], seed=5)\n self.assertEqual(len(result_str), 10)\n self.assertEqual(result_dict, {})\n def test_key_error(self):\n with self.assertRaises(KeyError):\n f_832(10, [\"has_uppercase\", \"invalid\"])\n def test_deduplicate_predicates(self):\n _, result_dict = f_832(15, [\"has_uppercase\", \"has_uppercase\"], seed=7)\n self.assertEqual(len(result_dict), 1)\n def test_random_seed_reproducibility(self):\n result_str1, result_dict1 = f_832(10, [\"has_uppercase\", \"has_numbers\"], seed=8)\n result_str2, result_dict2 = f_832(10, [\"has_uppercase\", \"has_numbers\"], seed=8)\n self.assertEqual(result_str1, result_str2)\n self.assertEqual(result_dict1, result_dict2)", "apis": ["string.ascii_letters", "random.seed", "string.digits", "string.punctuation", "random.choices"], "libs": ["random", "string"], "doc": {"description": ["Generates a random string of specified length and evaluates it for specific characteristics.", "Notes:", "- Predicates are deduplicated.", "- Characters are randomly sampled from string ascii_letters, digits, and punctuation with replacement.", "- Any invalid predicates provided will result in a KeyError.", "- If no predicates are provided, the result dictionary will be empty."], "note": [], "params": ["length (int): Desired length of the generated string.", "predicates (list of strings): Conditions to evaluate the string.", "Must contain options from 'has_uppercase', 'has_lowercase', 'has_special_chars', 'has_numbers'.", "seed (int, optional): Seed for the random number generator for reproducibility."], "returns": ["tuple:", "string: the generated random text", "dict: the text's characteristics"], "reqs": ["string", "random"], "raises": 
["ValueError: If the specified length is negative.", "KeyError: If any predicate is not recognized."], "example": [">>> f_832(10, ['has_uppercase', 'has_numbers'], seed=42)", "('8czu(\"@iNc', {'has_uppercase': True, 'has_numbers': True})", ">>> f_832(5, ['has_lowercase'], seed=123)", "('eiMk[', {'has_lowercase': True})"]}} +{"task_id": "f_395", "prompt": "from datetime import datetime, timedelta\nimport pandas as pd\nimport random\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_395(days_in_past=7, random_seed=0):\n \"\"\"\n Generates a graph of daily activity durations for a specified number of days in the past\n using randomly generated data for activities.\n\n This function randomly generates acitivity durations from 0 to 120 for each activity\n from [\"Running\", \"Swimming\", \"Cycling\", \"Yoga\", \"Weight Training\"].\n\n Parameters:\n days_in_past (int, optional): The number of days in the past for which to generate the graph.\n Defaults to 7 days. Must be in the past.\n random_seed (int, optional): Seed for random number generation to ensure reproducibility.\n Defaults to 0.\n\n Returns:\n Tuple containing\n - ax (plt.Axes): DataFrame used for plotting.\n - df (pd.DataFrame): Seaborn lineplot with date on the x-axis, duration on the y-axis, and activity as hue.\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - pandas\n - random\n - seaborn\n\n Example:\n >>> ax, df = f_395(7, random_seed=42)\n >>> type(ax)\n \n\n A sample row from the returned DataFrame might look like:\n Date Activity Duration\n YYYY-MM-DD Running 45\n \"\"\"", "canonical_solution": "\n random.seed(random_seed)\n\n if days_in_past < 1:\n raise ValueError(\"days_in_past must be in the past\")\n\n ACTIVITIES = [\"Running\", \"Swimming\", \"Cycling\", \"Yoga\", \"Weight Training\"]\n\n data = []\n for i in range(days_in_past):\n date = datetime.now().date() - timedelta(days=i)\n for activity in ACTIVITIES:\n duration = random.randint(0, 120)\n 
data.append([date, activity, duration])\n\n df = pd.DataFrame(data, columns=[\"Date\", \"Activity\", \"Duration\"])\n ax = sns.lineplot(data=df, x=\"Date\", y=\"Duration\", hue=\"Activity\")\n return ax, df", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.default_days_in_past = 7\n self.default_activities = [\n \"Running\",\n \"Swimming\",\n \"Cycling\",\n \"Yoga\",\n \"Weight Training\",\n ]\n def _check_df(self, df, days_in_past):\n self.assertEqual(set(df.columns), {\"Duration\", \"Activity\", \"Date\"})\n self.assertTrue((df[\"Duration\"] >= 0).all() and (df[\"Duration\"] <= 120).all())\n self.assertEqual(len(df[\"Date\"].unique()), days_in_past)\n def _check_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n legend_labels = [t.get_text() for t in ax.get_legend().get_texts()]\n for activity in self.default_activities:\n self.assertIn(activity, legend_labels)\n def test_case_1(self):\n # Test using default parameters\n ax, df = f_395()\n self._check_df(df, self.default_days_in_past)\n self._check_plot(ax)\n def test_case_2(self):\n # Test using custom parameters\n ax, df = f_395(10, random_seed=2)\n self._check_df(df, 10)\n self._check_plot(ax)\n def test_case_3(self):\n # Test days_in_past\n for ndays in [1, 5, 10, 100, 500]:\n _, df = f_395(ndays)\n self.assertEqual(len(df[\"Date\"].unique()), ndays)\n def test_case_4(self):\n # Test random seed\n _, df1 = f_395(10, random_seed=4)\n _, df2 = f_395(10, random_seed=4)\n _, df3 = f_395(10, random_seed=0)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertFalse(df2.equals(df3))\n def test_case_5(self):\n # Test handling invalid days in past\n with self.assertRaises(ValueError):\n f_395(0, random_seed=5)\n with self.assertRaises(ValueError):\n f_395(-1, random_seed=5)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "datetime.datetime.now", "random.randint", "seaborn.lineplot", "random.seed", 
"datetime.timedelta"], "libs": ["random", "seaborn", "pandas", "datetime"], "doc": {"description": ["Generates a graph of daily activity durations for a specified number of days in the past", "using randomly generated data for activities.", "This function randomly generates acitivity durations from 0 to 120 for each activity", "from [\"Running\", \"Swimming\", \"Cycling\", \"Yoga\", \"Weight Training\"].", "A sample row from the returned DataFrame might look like:", "Date Activity Duration", "YYYY-MM-DD Running 45"], "note": [], "params": ["days_in_past (int, optional): The number of days in the past for which to generate the graph.", "Defaults to 7 days. Must be in the past.", "random_seed (int, optional): Seed for random number generation to ensure reproducibility.", "Defaults to 0."], "returns": ["Tuple containing", "ax (plt.Axes): DataFrame used for plotting.", "df (pd.DataFrame): Seaborn lineplot with date on the x-axis, duration on the y-axis, and activity as hue."], "reqs": ["datetime.datetime", "datetime.timedelta", "pandas", "random", "seaborn"], "raises": [], "example": [">>> ax, df = f_395(7, random_seed=42)", ">>> type(ax)", ""]}} +{"task_id": "f_831", "prompt": "import os\nimport re\nfrom pathlib import Path\n\n\ndef f_831(dir_path: str, predicates: list) -> dict:\n \"\"\"\n Evaluates each item (files and directories) in a given directory against specified conditions.\n\n Parameters:\n - dir_path (str): The path to the directory to be evaluated. Must exist.\n - predicates (list of strings): Names of conditions to check for.\n Must contain valid conditions. Invalid conditions are ignored.\n Supported conditions:\n 1. 'is_file': whether the item is a file\n 2. 'is_dir': whether the item is a directory\n 3. 'has_special_chars': whether the item name contains a character that\n is not a letter, digit, or underscore, ignoring file extensions\n 4. 
'has_numbers': whether the item name contains a number\n\n Returns:\n - dict: A dictionary with directory items as keys and the results of condition checks as values.\n\n Raises:\n - ValueError: If no valid predicates are provided.\n - FileNotFoundError: If the specified directory does not exist or is not a directory.\n\n Note:\n - This function evaluates file/directory names, rather than their full path.\n - Predicates are deduplicated.\n\n Requirements:\n - os\n - re\n - pathlib\n\n Examples:\n >>> f_831('/path/to/dir', ['is_file', 'has_numbers'])\n {'file.txt': {'is_file': True, 'has_numbers': False}, 'file2.txt': {'is_file': True, 'has_numbers': True}}\n >>> f_831('/path/to/dir', ['is_dir', 'has_special_chars'])\n {'my_folder': {'is_dir': True, 'has_special_chars': False}, 'a_@Folder': {'is_dir': True, 'has_special_chars': True}}\n \"\"\"", "canonical_solution": " predicate_functions = {\n \"is_file\": lambda x: x.is_file(),\n \"is_dir\": lambda x: x.is_dir(),\n \"has_special_chars\": lambda x: bool(re.search(r\"\\W\", x.stem)),\n \"has_numbers\": lambda x: bool(re.search(r\"\\d\", x.name)),\n }\n predicates = [p for p in set(predicates) if p in predicate_functions]\n if not predicates:\n raise ValueError(\"No valid predicates provided.\")\n\n if not os.path.exists(dir_path) or not os.path.isdir(dir_path):\n raise FileNotFoundError(\n f\"The directory {dir_path} does not exist or is not a directory.\"\n )\n\n results = {}\n for item in os.listdir(dir_path):\n full_path = Path(os.path.join(dir_path, item))\n results[item] = {\n predicate_name: predicate_fn(full_path)\n for predicate_name, predicate_fn in predicate_functions.items()\n if predicate_name in predicates\n }\n return results", "test": "import unittest\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = TemporaryDirectory()\n self.test_dir = self.temp_dir.name\n self.fields = [\n \"is_file\",\n 
\"is_dir\",\n \"has_special_chars\",\n \"has_numbers\",\n ]\n self.is_file_fns = [\n \"file\",\n \"file.txt\",\n \"file1.txt\",\n \"somefile\",\n ]\n self.is_dir_fns = [\"somedir\", \"aDirectory123\"]\n def tearDown(self):\n self.temp_dir.cleanup()\n def helper_make_data(self, name, is_dir=False):\n # Helper function to make test files\n if is_dir:\n Path(os.path.join(self.test_dir, name)).mkdir()\n else:\n Path(os.path.join(self.test_dir, name)).touch()\n def helper_assert_predicate(self, results, predicates):\n # Helper to check only specified predicates are returned\n num_predicates = len(predicates)\n self.assertTrue(all(len(r) == num_predicates for r in results.values()))\n self.assertTrue(\n all(predicate in r for r in results.values() for predicate in predicates)\n )\n def test_file_is_file(self):\n field = \"is_file\"\n for fn in self.is_file_fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_file_fns:\n self.assertTrue(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_file_is_not_dir(self):\n field = \"is_dir\"\n for fn in self.is_file_fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_file_fns:\n self.assertFalse(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_dir_is_dir(self):\n field = \"is_dir\"\n for fn in self.is_dir_fns:\n self.helper_make_data(fn, is_dir=True)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_dir_fns:\n self.assertTrue(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_dir_is_not_file(self):\n field = \"is_file\"\n for fn in self.is_dir_fns:\n self.helper_make_data(fn, is_dir=True)\n result = f_831(str(self.test_dir), [field])\n for fn in self.is_dir_fns:\n self.assertFalse(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_has_special_char(self):\n field = \"has_special_chars\"\n fns 
= [\"fi!e\", \"fi@\", \"f.ile.txt\"]\n for fn in fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in fns:\n self.assertTrue(result[fn][field], result)\n self.helper_assert_predicate(result, [field])\n def test_has_no_special_char(self):\n field = \"has_special_chars\"\n fns = [\"file_\", \"_file\", \"file.txt\", \"some_file.txt\"]\n for fn in fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in fns:\n self.assertFalse(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_has_numbers(self):\n field = \"has_numbers\"\n fns = [\"123\", \"123.txt\", \"text123\", \"t1e2x3t4\"]\n for fn in fns:\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [field])\n for fn in fns:\n self.assertTrue(result[fn][field])\n self.helper_assert_predicate(result, [field])\n def test_multiple_predicates(self):\n fn = \"test1!.txt\"\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), self.fields)\n self.helper_assert_predicate(result, self.fields)\n self.assertTrue(result[fn][\"is_file\"])\n self.assertFalse(result[fn][\"is_dir\"])\n self.assertTrue(result[fn][\"has_special_chars\"])\n self.assertTrue(result[fn][\"has_numbers\"])\n def test_deduplicate_predicates(self):\n fn = \"test_file\"\n self.helper_make_data(fn, is_dir=False)\n result = f_831(str(self.test_dir), [\"is_file\", \"is_file\"])\n self.assertTrue(len(result) == 1)\n self.helper_assert_predicate(result, [\"is_file\"])\n def test_empty_predicates(self):\n with self.assertRaises(ValueError):\n f_831(str(self.test_dir), [])\n def test_invalid_predicates(self):\n with self.assertRaises(ValueError):\n f_831(str(self.test_dir), [\"foo\", \"bar\"])\n def test_nonexistent_directory_error(self):\n with self.assertRaises(FileNotFoundError):\n f_831(\"nonexistent_dir\", [\"is_file\"])", "apis": ["os.listdir", "re.search", "os.path", "pathlib.Path", 
"os.path.isdir", "os.path.join", "os.path.exists"], "libs": ["os", "pathlib", "re"], "doc": {"description": ["Evaluates each item (files and directories) in a given directory against specified conditions."], "note": ["This function evaluates file/directory names, rather than their full path.", "Predicates are deduplicated."], "params": ["dir_path (str): The path to the directory to be evaluated. Must exist.", "predicates (list of strings): Names of conditions to check for.", "Must contain valid conditions. Invalid conditions are ignored.", "Supported conditions:", "1. 'is_file': whether the item is a file", "2. 'is_dir': whether the item is a directory", "3. 'has_special_chars': whether the item name contains a character that", "is not a letter, digit, or underscore, ignoring file extensions", "4. 'has_numbers': whether the item name contains a number"], "returns": ["dict: A dictionary with directory items as keys and the results of condition checks as values."], "reqs": ["os", "re", "pathlib"], "raises": ["ValueError: If no valid predicates are provided.", "FileNotFoundError: If the specified directory does not exist or is not a directory."], "example": ["Examples:", ">>> f_831('/path/to/dir', ['is_file', 'has_numbers'])", "{'file.txt': {'is_file': True, 'has_numbers': False}, 'file2.txt': {'is_file': True, 'has_numbers': True}}", ">>> f_831('/path/to/dir', ['is_dir', 'has_special_chars'])", "{'my_folder': {'is_dir': True, 'has_special_chars': False}, 'a_@Folder': {'is_dir': True, 'has_special_chars': True}}"]}} +{"task_id": "f_921", "prompt": "from datetime import datetime\nimport pytz\nimport numpy as np\n\n\ndef f_921(time_strings, timezone):\n \"\"\"\n Calculates the average time difference in seconds between each consecutive pair of timestamps\n in a given list, after converting them to a specified timezone.\n\n Parameters:\n - time_strings (list of str): A list of timestamp strings in the format 'dd/mm/yy HH:MM:SS.fff'.\n - timezone (str): The timezone to 
which the timestamp strings should be converted.\n This should be a valid timezone string, e.g., 'America/New_York'.\n\n Returns:\n - float: The mean (average) time difference in seconds between each consecutive pair of timestamps.\n If there are less than two timestamps in the list, the function returns 0.0.\n\n Requirements:\n - datetime\n - pytz\n - numpy\n\n Notes:\n - The function first converts each timestamp in the list to the specified timezone.\n - It then calculates the absolute time difference in seconds between each consecutive pair of timestamps.\n - If the list contains less than two timestamps, the function returns 0.0, as there are no pairs to compare.\n - If there are no time differences (e.g., in case of a single timestamp after timezone conversion), it also returns 0.0.\n - The function uses numpy's mean function to calculate the average time difference.\n\n Example:\n >>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']\n >>> mean_diff = f_921(time_strings, 'America/New_York')\n >>> print(mean_diff)\n 61.0\n \"\"\"", "canonical_solution": " if len(time_strings) < 2:\n return 0.0\n\n time_zone = pytz.timezone(timezone)\n parsed_times = [\n datetime.strptime(ts, \"%d/%m/%y %H:%M:%S.%f\")\n .replace(tzinfo=pytz.UTC)\n .astimezone(time_zone)\n for ts in time_strings\n ]\n\n differences = [\n abs((t2 - t1).total_seconds()) for t1, t2 in zip(parsed_times, parsed_times[1:])\n ]\n\n return np.mean(differences) if differences else 0.0", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_921\"\"\"\n def test_example_case(self):\n \"\"\"Test the example case.\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:32:33.123\",\n \"30/03/09 16:33:34.123\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"America/New_York\"), 61.0)\n def test_different_timezones(self):\n \"\"\"Test different timezones.\"\"\"\n time_strings = [\n \"01/04/21 12:00:00.000\",\n 
\"01/04/21 12:01:01.000\",\n \"01/04/21 12:02:02.000\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 61.0)\n self.assertAlmostEqual(f_921(time_strings, \"Europe/London\"), 61.0)\n def test_varying_differences(self):\n \"\"\"Test varying differences.\"\"\"\n time_strings = [\n \"01/04/21 12:00:00.000\",\n \"01/04/21 12:01:01.000\",\n \"01/04/21 12:03:03.000\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 91.5)\n def test_single_time_string(self):\n \"\"\"Test single time string.\"\"\"\n time_strings = [\"01/04/21 12:00:00.000\"]\n self.assertEqual(f_921(time_strings, \"Asia/Tokyo\"), 0.0)\n def test_span_across_days(self):\n \"\"\"Test span across days.\"\"\"\n time_strings = [\"31/03/21 23:59:00.000\", \"01/04/21 00:01:00.000\"]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 120.0)\n def test_out_of_order_strings(self):\n \"\"\"Test out of order strings.\"\"\"\n time_strings = [\n \"01/04/21 12:02:02.000\",\n \"01/04/21 12:00:00.000\",\n \"01/04/21 12:01:01.000\",\n ]\n self.assertAlmostEqual(f_921(time_strings, \"Asia/Tokyo\"), 91.5)", "apis": ["numpy.mean", "pytz.timezone", "datetime.datetime.strptime", "pytz.UTC"], "libs": ["numpy", "pytz", "datetime"], "doc": {"description": ["Calculates the average time difference in seconds between each consecutive pair of timestamps", "in a given list, after converting them to a specified timezone.", "Notes:", "- The function first converts each timestamp in the list to the specified timezone.", "- It then calculates the absolute time difference in seconds between each consecutive pair of timestamps.", "- If the list contains less than two timestamps, the function returns 0.0, as there are no pairs to compare.", "- If there are no time differences (e.g., in case of a single timestamp after timezone conversion), it also returns 0.0.", "- The function uses numpy's mean function to calculate the average time difference."], "note": [], "params": ["time_strings (list of 
str): A list of timestamp strings in the format 'dd/mm/yy HH:MM:SS.fff'.", "timezone (str): The timezone to which the timestamp strings should be converted.", "This should be a valid timezone string, e.g., 'America/New_York'."], "returns": ["float: The mean (average) time difference in seconds between each consecutive pair of timestamps.", "If there are less than two timestamps in the list, the function returns 0.0."], "reqs": ["datetime", "pytz", "numpy"], "raises": [], "example": [">>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']", ">>> mean_diff = f_921(time_strings, 'America/New_York')", ">>> print(mean_diff)", "61.0"]}} +{"task_id": "f_913", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_913(data_dict):\n \"\"\"\n Generates histograms for each column in the given DataFrame and checks if the value distributions\n are uniform. It prints a message for each non-uniform distribution.\n\n Parameters:\n df (pd.DataFrame): The DataFrame to be analyzed.\n\n Returns:\n List[plt.Axes]: A list of matplotlib Axes objects, each representing the histogram for a column.\n \n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> data = {'Category1': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E'],\n ... 
'Category2': ['X', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'W', 'W', 'W', 'W', 'W']}\n >>> axes = f_913(data)\n The distribution of values in column 'Category1' is not uniform.\n The distribution of values in column 'Category2' is not uniform.\n >>> [ax.get_title() for ax in axes]\n ['Category1', 'Category2']\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_dict)\n axes_list = []\n for column in df.columns:\n counts = df[column].value_counts()\n uniform = (\n len(set(counts)) == 1\n ) # Check if all counts are the same (uniform distribution)\n\n if not uniform:\n print(f\"The distribution of values in column '{column}' is not uniform.\")\n\n ax = counts.plot(kind=\"bar\")\n ax.set_title(column)\n axes_list.append(ax)\n plt.close()\n\n return axes_list", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_913 function.\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test for uniform distribution.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n \"Category2\": [\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\", \"Category2\"])\n def test_non_uniform_distribution(self):\n \"\"\"Test for non-uniform distribution.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\", \"C\"],\n \"Category2\": [\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\", \"Z\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\", \"Category2\"])\n def test_single_column(self):\n \"\"\"Test for single column.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\"])\n def test_multiple_categories(self):\n \"\"\"Test for multiple categories.\"\"\"\n data = {\n \"Category1\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\", \"D\", \"D\", \"E\", \"E\"],\n \"Category2\": 
[\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\", \"W\", \"W\", \"V\", \"V\"],\n }\n axes = f_913(data)\n self.assertEqual([ax.get_title() for ax in axes], [\"Category1\", \"Category2\"])\n def test_empty_dataframe(self):\n \"\"\"Test for empty dataframe.\"\"\"\n data = {}\n axes = f_913(data)\n self.assertEqual(axes, [])", "apis": ["pandas.DataFrame", "matplotlib.pyplot.close"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Generates histograms for each column in the given DataFrame and checks if the value distributions", "are uniform. It prints a message for each non-uniform distribution."], "note": [], "params": ["df (pd.DataFrame): The DataFrame to be analyzed."], "returns": ["List[plt.Axes]: A list of matplotlib Axes objects, each representing the histogram for a column."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> data = {'Category1': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E'],", "... 'Category2': ['X', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'W', 'W', 'W', 'W', 'W']}", ">>> axes = f_913(data)", "The distribution of values in column 'Category1' is not uniform.", "The distribution of values in column 'Category2' is not uniform.", ">>> [ax.get_title() for ax in axes]", "['Category1', 'Category2']"]}} +{"task_id": "f_419", "prompt": "from collections import Counter\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_419(df, n_clusters=3, random_state=None, n_init=10):\n \"\"\"\n Identify duplicate points in a DataFrame, perform KMeans clustering on the unique points,\n and record the clusters.\n\n Parameters:\n df (pd.DataFrame): A DataFrame containing at least two columns 'x' and 'y' representing points.\n n_clusters (int, optional): Number of clusters for KMeans clustering. Default is 3.\n random_state (int, optional): The seed used by the random number generator for reproducibility. 
Default is None.\n n_init (int, optional): Number of time the k-means algorithm will be run with different centroid seeds.\n The final results will be the best output of n_init consecutive runs in terms of\n within-cluster sum of squares. Default is 10.\n\n Returns:\n tuple: A tuple containing:\n - Counter: A Counter object with the count of duplicate points.\n - pd.DataFrame: A DataFrame with an additional column 'cluster' representing cluster assignments for unique points.\n - Axes: A scatter plot of the clustered data.\n\n Requirements:\n - collections.Counter\n - sklearn.cluster.KMeans\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({\\\n 'x': [1, 2, 2, 2, 3, 4],\\\n 'y': [1, 1, 1, 1, 3, 3]\\\n })\n >>> duplicates, df_clustered, ax = f_419(df, random_state=42)\n >>> df_clustered\n x y cluster\n 0 1 1 2\n 1 2 1 0\n 4 3 3 1\n 5 4 3 1\n >>> duplicates\n Counter({(2, 1): 3})\n \"\"\"", "canonical_solution": " # Identify duplicates\n duplicates = df[df.duplicated(subset=[\"x\", \"y\"], keep=False)]\n duplicates_counter = Counter(map(tuple, duplicates[[\"x\", \"y\"]].values))\n\n # Remove duplicates and perform KMeans clustering on unique points\n unique_df = df.drop_duplicates(subset=[\"x\", \"y\"]).copy()\n\n # Adjust n_clusters if unique data points are fewer than desired clusters\n n_clusters = min(n_clusters, len(unique_df))\n\n kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)\n unique_df[\"cluster\"] = kmeans.fit_predict(unique_df[[\"x\", \"y\"]])\n\n # Plot clustered data\n fig, ax = plt.subplots()\n scatter = ax.scatter(unique_df[\"x\"], unique_df[\"y\"], c=unique_df[\"cluster\"])\n ax.set_xlabel(\"x\")\n ax.set_ylabel(\"y\")\n ax.set_title(\"KMeans Clusters\")\n\n return duplicates_counter, unique_df, ax", "test": "import unittest\nimport pandas as pd\nfrom collections import Counter\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic functionality with 
duplicates\n df = pd.DataFrame({\"x\": [1, 2, 2, 2, 3, 4], \"y\": [1, 1, 1, 1, 3, 3]})\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(2, 1): 3}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n self.assertFalse(df_clustered[\"cluster\"].isna().any())\n def test_case_2(self):\n # Test functionality without duplicates\n df = pd.DataFrame({\"x\": [1, 2, 3, 4, 5, 6], \"y\": [1, 2, 3, 4, 5, 6]})\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter())\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n def test_case_3(self):\n # Test functionality with all points being duplicates\n df = pd.DataFrame({\"x\": [1, 1, 1, 1, 1, 1], \"y\": [1, 1, 1, 1, 1, 1]})\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(1, 1): 6}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n def test_case_4(self):\n # Test with specified number of clusters\n df = pd.DataFrame({\"x\": [1, 2, 3, 40, 50, 60], \"y\": [1, 2, 3, 40, 50, 60]})\n duplicates, df_clustered, ax = f_419(df, n_clusters=2, random_state=42)\n self.assertEqual(duplicates, Counter())\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n def test_case_5(self):\n # Test functionality with multiple duplicates\n df = pd.DataFrame(\n {\"x\": [1, 2, 3, 4, 5, 5, 5, 5], \"y\": [1, 2, 3, 4, 5, 5, 5, 5]}\n )\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(5, 5): 4}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n self.assertFalse(df_clustered[\"cluster\"].isna().any())\n def test_case_6(self):\n # Test with a mix of unique points and duplicates\n df = 
pd.DataFrame(\n {\"x\": [1, 2, 3, 3, 3, 4, 5, 6], \"y\": [1, 2, 3, 3, 3, 4, 5, 6]}\n )\n duplicates, df_clustered, ax = f_419(df, random_state=42)\n self.assertEqual(duplicates, Counter({(3, 3): 3}))\n self.assertIn(\"cluster\", df_clustered.columns)\n self.assertEqual(ax.get_title(), \"KMeans Clusters\")\n self.assertFalse(df_clustered[\"cluster\"].isna().any())\n def test_case_7(self):\n # Easily separable data\n df = pd.DataFrame(\n {\n \"x\": [1, 2, 3, 10, 11, 12, 20, 21, 22],\n \"y\": [1, 2, 3, 10, 11, 12, 20, 21, 22],\n }\n )\n # We expect 3 clusters because of the natural separation in data\n duplicates, df_clustered, _ = f_419(df, n_clusters=3, random_state=42)\n self.assertEqual(duplicates, Counter())\n # Check that all points in a specific region belong to the same cluster\n cluster_1 = df_clustered[df_clustered[\"x\"] <= 3][\"cluster\"].nunique()\n cluster_2 = df_clustered[(df_clustered[\"x\"] > 3) & (df_clustered[\"x\"] <= 12)][\n \"cluster\"\n ].nunique()\n cluster_3 = df_clustered[df_clustered[\"x\"] > 12][\"cluster\"].nunique()\n self.assertEqual(\n cluster_1, 1\n ) # All points in this region should belong to the same cluster\n self.assertEqual(\n cluster_2, 1\n ) # All points in this region should belong to the same cluster\n self.assertEqual(\n cluster_3, 1\n ) # All points in this region should belong to the same cluster\n def test_case_8(self):\n # Test effects of random state on clustering outcome\n df = pd.DataFrame(\n {\"x\": [10, 20, 20, 40, 50, 60], \"y\": [10, 20, 20, 40, 50, 60]}\n )\n _, df_clustered_1, _ = f_419(df, n_clusters=2, random_state=42)\n _, df_clustered_2, _ = f_419(df, n_clusters=2, random_state=42)\n # Clusters should be the same for the same random state\n self.assertTrue((df_clustered_1[\"cluster\"] == df_clustered_2[\"cluster\"]).all())\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "collections.Counter", "sklearn.cluster.KMeans"], "libs": ["sklearn", "collections", "matplotlib"], 
"doc": {"description": ["Identify duplicate points in a DataFrame, perform KMeans clustering on the unique points,", "and record the clusters."], "note": [], "params": ["df (pd.DataFrame): A DataFrame containing at least two columns 'x' and 'y' representing points.", "n_clusters (int, optional): Number of clusters for KMeans clustering. Default is 3.", "random_state (int, optional): The seed used by the random number generator for reproducibility. Default is None.", "n_init (int, optional): Number of time the k-means algorithm will be run with different centroid seeds.", "The final results will be the best output of n_init consecutive runs in terms of", "within-cluster sum of squares. Default is 10."], "returns": ["tuple: A tuple containing:", "Counter: A Counter object with the count of duplicate points.", "pd.DataFrame: A DataFrame with an additional column 'cluster' representing cluster assignments for unique points.", "Axes: A scatter plot of the clustered data."], "reqs": ["collections.Counter", "sklearn.cluster.KMeans", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({\\", "'x': [1, 2, 2, 2, 3, 4],\\", "'y': [1, 1, 1, 1, 3, 3]\\", "})", ">>> duplicates, df_clustered, ax = f_419(df, random_state=42)", ">>> df_clustered", "x y cluster", "0 1 1 2", "1 2 1 0", "4 3 3 1", "5 4 3 1", ">>> duplicates", "Counter({(2, 1): 3})"]}} +{"task_id": "f_900", "prompt": "import numpy as np\nimport random\nimport matplotlib.pyplot as plt\n\n# Constants\nLETTERS = list(\"abcdefghijklmnopqrstuvwxyz\")\nNUMBERS = list(range(1, 27))\n\n\ndef f_900(n_pairs=26):\n \"\"\"\n This function generates and displays a bar chart representing random letter-number pairs.\n Each bar corresponds to a unique pair, formed by combining a letter from 'a' to 'z' with a number\n from 1 to 26. 
The function randomly shuffles these pairs and assigns a random count to each.\n\n Parameters:\n - n_pairs (int, optional): The number of letter-number pairs to display in the bar chart.\n The value must be an integer between 1 and 26, inclusive. The default value is 26, which\n includes one pair for each letter in the alphabet.\n\n Returns:\n - matplotlib.container.BarContainer: This object represents the bar chart created by the function.\n Each bar in the chart is labeled with its corresponding letter-number pair (e.g., 'a:1', 'b:2').\n\n Raises:\n - ValueError: If 'n_pairs' is outside the range of 1 to 26, inclusive. This ensures that the function\n operates within the bounds of the predefined letters ('a' to 'z') and numbers (1 to 26).\n\n Requirements:\n - numpy\n - matplotlib\n - random\n\n Notes:\n - Each call to this function will likely produce a different chart because it shuffles the order\n of the pairs and assigns random counts to them.\n - The random counts assigned to each pair range from 1 to 9.\n\n Example:\n >>> ax = f_900(5)\n >>> [bar.get_label() for bar in ax]\n ['d:4', 'b:2', 'c:3', 'e:5', 'a:1']\n \"\"\"", "canonical_solution": " if n_pairs > 26 or n_pairs < 1:\n raise ValueError(\"n_pairs should be between 1 and 26\")\n\n pairs = [f\"{letter}:{number}\" for letter, number in zip(LETTERS, NUMBERS)][:n_pairs]\n random.seed(42)\n random.shuffle(pairs)\n counts = np.random.randint(1, 10, size=n_pairs)\n\n bars = plt.bar(pairs, counts)\n\n # Set label for each bar\n for bar, pair in zip(bars, pairs):\n bar.set_label(pair)\n\n plt.xlabel(\"Letter:Number Pairs\")\n plt.ylabel(\"Counts\")\n plt.title(\"Random Letter:Number Pairs Chart\")\n\n return bars", "test": "import unittest\nimport matplotlib.pyplot as plt\nfrom matplotlib.container import BarContainer\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_900.\"\"\"\n def test_return_type(self):\n \"\"\"Verify the returned type of the function.\"\"\"\n 
random.seed(0)\n ax = f_900(5)\n self.assertIsInstance(\n ax, BarContainer, \"The returned object is not of the expected type.\"\n )\n def test_number_of_bars(self):\n \"\"\"Verify the number of bars plotted for different `n_pairs` values.\"\"\"\n random.seed(1)\n for i in [5, 10, 20]:\n ax = f_900(i)\n self.assertEqual(\n len(ax.patches),\n i,\n f\"Expected {i} bars, but got {len(ax.patches)} bars.\",\n )\n def test_labels_and_title(self):\n \"\"\"Verify the labels and the title of the plotted bar chart.\"\"\"\n random.seed(2)\n _ = f_900(15)\n fig = plt.gcf()\n axes = fig.gca()\n self.assertEqual(\n axes.get_xlabel(), \"Letter:Number Pairs\", \"X label is incorrect.\"\n )\n self.assertEqual(axes.get_ylabel(), \"Counts\", \"Y label is incorrect.\")\n self.assertEqual(\n axes.get_title(), \"Random Letter:Number Pairs Chart\", \"Title is incorrect.\"\n )\n def test_invalid_n_pairs(self):\n \"\"\"Test the function with invalid `n_pairs` values.\"\"\"\n random.seed(3)\n with self.assertRaises(ValueError):\n f_900(27)\n with self.assertRaises(ValueError):\n f_900(0)\n def test_valid_pairs(self):\n \"\"\"Verify that the pairs generated are valid and correspond to the expected letter:number format.\"\"\"\n random.seed(4)\n ax = f_900(5)\n expected_pairs = [\"a:1\", \"b:2\", \"c:3\", \"d:4\", \"e:5\"]\n generated_pairs = [bar.get_label() for bar in ax]\n for expected_pair in expected_pairs:\n self.assertIn(\n expected_pair,\n generated_pairs,\n f\"Expected pair {expected_pair} not found in plotted pairs.\",\n )", "apis": ["numpy.random.randint", "matplotlib.pyplot.bar", "numpy.random", "random.shuffle", "random.seed", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.xlabel"], "libs": ["random", "numpy", "matplotlib"], "doc": {"description": ["This function generates and displays a bar chart representing random letter-number pairs.", "Each bar corresponds to a unique pair, formed by combining a letter from 'a' to 'z' with a number", "from 1 to 26. 
The function randomly shuffles these pairs and assigns a random count to each.", "Notes:", "- Each call to this function will likely produce a different chart because it shuffles the order", "of the pairs and assigns random counts to them.", "- The random counts assigned to each pair range from 1 to 9."], "note": [], "params": ["n_pairs (int, optional): The number of letter-number pairs to display in the bar chart.", "The value must be an integer between 1 and 26, inclusive. The default value is 26, which", "includes one pair for each letter in the alphabet."], "returns": ["matplotlib.container.BarContainer: This object represents the bar chart created by the function.", "Each bar in the chart is labeled with its corresponding letter-number pair (e.g., 'a:1', 'b:2')."], "reqs": ["numpy", "matplotlib", "random"], "raises": ["ValueError: If 'n_pairs' is outside the range of 1 to 26, inclusive. This ensures that the function", "operates within the bounds of the predefined letters ('a' to 'z') and numbers (1 to 26)."], "example": [">>> ax = f_900(5)", ">>> [bar.get_label() for bar in ax]", "['d:4', 'b:2', 'c:3', 'e:5', 'a:1']"]}} +{"task_id": "f_330", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_330(data, column=\"c\"):\n \"\"\"\n Remove a column from a data dictionary if it exists, and then plot the remaining data\n if it contains numeric data.\n\n Parameters:\n - data (dict): The input data dictionary.\n - column (str): Name of column to remove. 
Defaults to \"c\".\n\n Returns:\n - df (pd.DataFrame): The modified DataFrame after removing the specified column.\n - ax (matplotlib.axes._axes.Axes or None): The plot of the modified DataFrame if there's\n numeric data to plot, otherwise None.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}\n >>> modified_df, ax = f_330(data)\n >>> ax\n \n >>> modified_df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n if column in df.columns:\n df = df.drop(columns=column)\n\n # If there's no numeric data, return None for the plot.\n if df.empty or not np.any(df.dtypes.apply(pd.api.types.is_numeric_dtype)):\n return df, None\n\n ax = df.plot()\n return df, ax", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Scenario: DataFrame with columns 'a', 'b', and 'c'.\n np.random.seed(0)\n data = {\n \"a\": np.random.randn(10),\n \"b\": np.random.randn(10),\n \"c\": np.random.randn(10),\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = f_330(data) # Remove default column 'c'.\n # Assert column 'c' removal and plot data verification.\n self.assertNotIn(\"c\", modified_df.columns)\n plotted_data = [line.get_ydata() for line in ax.get_lines()]\n self.assertTrue(\n all(\n [\n np.array_equal(data, modified_df[col].values)\n for data, col in zip(plotted_data, modified_df.columns)\n ]\n )\n )\n def test_case_2(self):\n # Scenario: DataFrame with columns 'a' and 'b' (no 'c').\n np.random.seed(0)\n data = {\"a\": np.random.randn(10), \"b\": np.random.randn(10)}\n df = pd.DataFrame(data)\n modified_df, ax = f_330(data)\n # Assert that the modified DataFrame remains unchanged and plot is generated.\n self.assertEqual(list(df.columns), list(modified_df.columns))\n self.assertIsNotNone(ax)\n def test_case_3(self):\n # Scenario: Empty DataFrame\n data = {}\n df = 
pd.DataFrame(data)\n modified_df, ax = f_330(data)\n # Assert empty DataFrame and no plot.\n self.assertTrue(modified_df.empty)\n self.assertIsNone(ax)\n def test_case_4(self):\n # Scenario: DataFrame with single non-numeric column 'c'.\n data = {\"c\": [\"apple\", \"banana\", \"cherry\"]}\n df = pd.DataFrame(data)\n modified_df, ax = f_330(data)\n # Assert empty DataFrame after 'c' removal and no plot.\n self.assertTrue(modified_df.empty)\n self.assertIsNone(ax)\n def test_case_5(self):\n np.random.seed(0)\n # Scenario: DataFrame with columns 'a', 'b', 'c', and non-numeric column 'd'.\n data = {\n \"a\": np.random.randn(10),\n \"b\": np.random.randn(10),\n \"c\": np.random.randn(10),\n \"d\": [\n \"apple\",\n \"banana\",\n \"cherry\",\n \"date\",\n \"fig\",\n \"grape\",\n \"honeydew\",\n \"kiwi\",\n \"lime\",\n \"mango\",\n ],\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = f_330(data)\n # Assert column 'c' removal and plot data verification excluding non-numeric column 'd'.\n self.assertNotIn(\"c\", modified_df.columns)\n plotted_data = [line.get_ydata() for line in ax.get_lines()]\n self.assertTrue(\n all(\n [\n np.array_equal(data, modified_df[col].values)\n for data, col in zip(plotted_data, modified_df.columns)\n if col != \"d\"\n ]\n )\n )\n def test_case_6(self):\n # Scenario: Remove specified column.\n np.random.seed(0)\n data = {\n \"a\": np.random.randn(10),\n \"b\": np.random.randn(10),\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = f_330(df, column=\"a\")\n self.assertNotIn(\"a\", modified_df.columns)\n plotted_data = [line.get_ydata() for line in ax.get_lines()]\n self.assertTrue(\n all(\n [\n np.array_equal(data, modified_df[col].values)\n for data, col in zip(plotted_data, modified_df.columns)\n ]\n )\n )\n def test_case_7(self):\n # Scenario: Only non-numeric columns.\n data = {\n \"a\": [\"apple\", \"banana\"],\n \"b\": [\"cherry\", \"date\"],\n \"c\": [\"fig\", \"grape\"],\n }\n df = pd.DataFrame(\n data\n )\n modified_df, ax = 
f_330(data)\n self.assertNotIn(\"c\", modified_df.columns)\n pd.testing.assert_frame_equal(df[[\"a\", \"b\"]], modified_df)\n self.assertEqual(ax, None)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "numpy.any", "pandas.api"], "libs": ["numpy", "pandas"], "doc": {"description": ["Remove a column from a data dictionary if it exists, and then plot the remaining data", "if it contains numeric data."], "note": [], "params": ["data (dict): The input data dictionary.", "column (str): Name of column to remove. Defaults to \"c\"."], "returns": ["df (pd.DataFrame): The modified DataFrame after removing the specified column.", "ax (matplotlib.axes._axes.Axes or None): The plot of the modified DataFrame if there's", "numeric data to plot, otherwise None."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> data = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}", ">>> modified_df, ax = f_330(data)", ">>> ax", "", ">>> modified_df", "a b", "0 1 4", "1 2 5", "2 3 6"]}} +{"task_id": "f_932", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_932(data=None):\n \"\"\"\n Pre-process a dataset by converting it to a Pandas DataFrame,\n replacing values less than 0.5 with zeros, and\n standardizing the data using StandardScaler.\n\n Parameters:\n - data (numpy.ndarray, optional): A numpy array representing the dataset. If not provided, a random dataset\n of shape (100, 5) is generated.\n\n Returns:\n - pandas.DataFrame: The preprocessed dataset. 
Original values less than 0.5 are replaced with zeros, and the\n entire dataset is standardized.\n\n Requirements:\n - numpy\n - pandas\n - sklearn.preprocessing.StandardScaler\n\n Example:\n >>> np.random.seed(0)\n >>> dataset = np.random.rand(10, 5)\n >>> preprocessed_data = f_932(dataset)\n >>> preprocessed_data.head(2)\n 0 1 2 3 4\n 0 0.175481 1.062315 0.244316 -0.17039 -0.647463\n 1 0.461851 -0.978767 1.052947 1.06408 -0.647463\n \"\"\"", "canonical_solution": " if data is None:\n data = np.random.rand(100, 5)\n\n df = pd.DataFrame(data)\n df[df < 0.5] = 0\n\n scaler = StandardScaler()\n scaled_data = scaler.fit_transform(df)\n standardized_df = pd.DataFrame(scaled_data, columns=df.columns)\n\n return standardized_df", "test": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_932.\"\"\"\n def test_default_dataset(self):\n \"\"\"Test the function with default dataset.\"\"\"\n result = f_932()\n self.assertIsInstance(result, pd.DataFrame)\n self.assertEqual(result.shape, (100, 5))\n def test_small_dataset(self):\n \"\"\"Test the function with a small dataset.\"\"\"\n data = np.array([[0.1, 0.9], [0.4, 0.8]])\n result = f_932(data)\n self.assertEqual(result.shape, (2, 2))\n def test_replacement(self):\n \"\"\"Test the replacement of values less than 0.5.\"\"\"\n data = np.array([[0.1, 0.9], [0.4, 0.8]])\n result = f_932(data)\n self.assertNotIn(0.1, result.values)\n self.assertNotIn(0.4, result.values)\n def test_no_replacement(self):\n \"\"\"Test no replacement for values greater than 0.5.\"\"\"\n data = np.array([[0.6, 0.9], [0.7, 0.8]])\n result = f_932(data)\n self.assertNotIn(0.6, result.values)\n self.assertNotIn(0.7, result.values)\n self.assertNotIn(0.8, result.values)\n self.assertNotIn(0.9, result.values)\n def test_standardization(self):\n \"\"\"Test the standardization of the dataset.\"\"\"\n data = np.array([[1, 2, 3], 
[4, 5, 6], [7, 8, 9]])\n result = f_932(data)\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.isclose(result.std().mean(), 1.225, atol=0.01))\n \"\"\"Test the replacement of values less than 0.5.\"\"\"\n data = np.array([[0.1, 0.9], [0.4, 0.8]])\n result = f_932(data)\n self.assertNotIn(0.1, result.values)\n self.assertNotIn(0.4, result.values)", "apis": ["pandas.DataFrame", "numpy.random.rand", "sklearn.preprocessing.StandardScaler", "numpy.random"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Pre-process a dataset by converting it to a Pandas DataFrame,", "replacing values less than 0.5 with zeros, and", "standardizing the data using StandardScaler."], "note": [], "params": ["data (numpy.ndarray, optional): A numpy array representing the dataset. If not provided, a random dataset", "of shape (100, 5) is generated."], "returns": ["pandas.DataFrame: The preprocessed dataset. Original values less than 0.5 are replaced with zeros, and the", "entire dataset is standardized."], "reqs": ["numpy", "pandas", "sklearn.preprocessing.StandardScaler"], "raises": [], "example": [">>> np.random.seed(0)", ">>> dataset = np.random.rand(10, 5)", ">>> preprocessed_data = f_932(dataset)", ">>> preprocessed_data.head(2)", "0 1 2 3 4", "0 0.175481 1.062315 0.244316 -0.17039 -0.647463", "1 0.461851 -0.978767 1.052947 1.06408 -0.647463"]}} {"task_id": "f_864", "prompt": "import binascii\nimport hashlib\n\n\ndef f_864(input_string, verify_hash=None):\n \"\"\"\n Compute the SHA256 hash of a given input string and return its hexadecimal representation.\n Optionally, verify the computed hash against a provided hash.\n\n Parameters:\n - input_string (str): The string to be hashed.\n - verify_hash (str, optional): A hexadecimal string to be compared with the computed hash.\n\n Returns:\n - str: A hexadecimal string representing the SHA256 hash of the input string.\n - bool: True if verify_hash is provided and matches the computed 
hash, otherwise None.\n\n Raises:\n - TypeError: If the input is not a string or verify_hash is not a string or None.\n\n Requirements:\n - hashlib\n - binascii\n\n Example:\n >>> f_864(\"Hello, World!\")\n 'a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e'\n >>> f_864(\"Hello, World!\", \"a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e\")\n True\n \"\"\"", "canonical_solution": " if not isinstance(input_string, str):\n raise TypeError(\"Input must be a string\")\n if verify_hash is not None and not isinstance(verify_hash, str):\n raise TypeError(\"verify_hash must be a string or None\")\n\n hashed_bytes = hashlib.sha256(input_string.encode()).digest()\n hex_encoded_hash = binascii.hexlify(hashed_bytes).decode()\n\n if verify_hash is not None:\n return hex_encoded_hash == verify_hash\n\n return hex_encoded_hash", "test": "import unittest\nimport binascii\nimport hashlib\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_864.\"\"\"\n def test_string_with_numbers(self):\n \"\"\"Test that the function returns the correct hash for a string with numbers.\"\"\"\n self.assertEqual(\n f_864(\"4a4b4c\"),\n \"1a3db6ced8854274567d707b509f7486a9244be0cab89217713fce9bf09f522e\",\n )\n def test_string_with_space(self):\n \"\"\"Test that the function returns the correct hash for a string with space.\"\"\"\n self.assertEqual(\n f_864(\"Open AI\"),\n \"dd7503942d7be003d6faaa93d9951126fde3bdd4f3484404927e79585682878a\",\n )\n def test_empty_string(self):\n \"\"\"Test that the function returns the correct hash for an empty string.\"\"\"\n self.assertEqual(\n f_864(\"\"),\n \"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855\",\n )\n def test_string_numbers(self):\n \"\"\"Test that the function returns the correct hash for a string numbers.\"\"\"\n self.assertEqual(\n f_864(\"123456\"),\n \"8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92\",\n )\n def test_long_string(self):\n \"\"\"Test that the function returns 
the correct hash for a long string.\"\"\"\n self.assertEqual(\n f_864(\"abcdefghijklmnopqrstuvwxyz\"),\n \"71c480df93d6ae2f1efad1447c66c9525e316218cf51fc8d9ed832f2daf18b73\",\n )\n def test_verify_hash_correct(self):\n \"\"\"Test that the function returns True when verify_hash is correct.\"\"\"\n self.assertTrue(\n f_864(\n \"Hello, World!\",\n \"dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f\",\n )\n )\n def test_verify_hash_incorrect(self):\n \"\"\"Test that the function returns False when verify_hash is incorrect.\"\"\"\n self.assertFalse(f_864(\"Hello, World!\", \"incorrect_hash\"))\n def test_verify_hash_none(self):\n \"\"\"Test that the function returns None when verify_hash is None.\"\"\"\n self.assertEqual(\n f_864(\"Hello, World!\"),\n \"dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f\",\n )\n def test_input_string_not_string(self):\n \"\"\"Test that the function raises an error when the input is not a string.\"\"\"\n with self.assertRaises(TypeError):\n f_864(123)\n def test_verify_hash_not_string_or_none(self):\n \"\"\"Test that the function raises an error when verify_hash is not a string or None.\"\"\"\n with self.assertRaises(TypeError):\n f_864(\"Hello, World!\", 123)", "apis": ["hashlib.sha256", "binascii.hexlify"], "libs": ["binascii", "hashlib"], "doc": {"description": ["Compute the SHA256 hash of a given input string and return its hexadecimal representation.", "Optionally, verify the computed hash against a provided hash."], "note": [], "params": ["input_string (str): The string to be hashed.", "verify_hash (str, optional): A hexadecimal string to be compared with the computed hash."], "returns": ["str: A hexadecimal string representing the SHA256 hash of the input string.", "bool: True if verify_hash is provided and matches the computed hash, otherwise None."], "reqs": ["hashlib", "binascii"], "raises": ["TypeError: If the input is not a string or verify_hash is not a string or None."], "example": [">>> 
f_864(\"Hello, World!\")", "'a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e'", ">>> f_864(\"Hello, World!\", \"a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e\")", "True"]}} -{"task_id": "f_344", "prompt": "import numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_344(P, T):\n \"\"\"\n Calculate the product of a matrix 'P' and a 3D tensor 'T' using numpy and visualize the results as a heatmap.\n Note: This function only accepts numpy matrices/arrays.\n\n Parameters:\n - P (numpy.ndarray): Input matrix of shape (M, 3), where M can be any positive integer.\n - T (numpy.ndarray): Input tensor of shape (3, 3, 3).\n\n Returns:\n - numpy.ndarray: Resultant product after matrix-tensor multiplication.\n - matplotlib.axes.Axes: Axes object displaying the heatmap of the 2D result.\n\n Requirements:\n - numpy\n - seaborn\n\n Example:\n >>> np.random.seed(0)\n >>> P = np.array([[6, 2, 7], [1, 1, 8]])\n >>> T = np.random.rand(3, 3, 3)\n >>> product, heatmap = f_344(P, T)\n >>> product\n array([[[ 9.50686132, 11.96467131, 11.52469849],\n [ 9.99949817, 7.62347761, 9.48114103],\n [ 3.62770285, 9.87052195, 8.45068927]],\n \n [[ 7.15750903, 8.46701159, 8.96060503],\n [ 7.50619626, 5.04108634, 6.96116358],\n [ 1.47091192, 6.03135957, 2.94310891]]])\n >>> type(heatmap)\n \n \"\"\"", "canonical_solution": " if not (isinstance(P, np.ndarray) and isinstance(T, np.ndarray)):\n raise TypeError(\"Expected inputs to be numpy arrays\")\n\n result = np.tensordot(P, T, axes=[1, 0])\n # Sum along the last dimension to get a 2D matrix\n result_2D = np.sum(result, axis=-1)\n heatmap = sns.heatmap(result_2D)\n return result, heatmap", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n self.test_P = np.array([[6, 2, 7], [1, 1, 8]])\n self.test_P_zeros = np.zeros((2, 3))\n self.test_T = np.array(\n [\n [[1, 2, 3], [4, 5, 6], [7, 8, 
9]],\n [[2, 3, 4], [5, 6, 7], [8, 9, 10]],\n [[3, 4, 5], [6, 7, 8], [9, 10, 11]],\n ]\n )\n def test_case_1(self):\n # Test return types\n product, heatmap = f_344(self.test_P, self.test_T)\n self.assertIsInstance(product, np.ndarray)\n self.assertIsInstance(heatmap, plt.Axes)\n def test_case_2(self):\n # Test output correctness\n product, _ = f_344(self.test_P, self.test_T)\n expected_product = np.tensordot(self.test_P, self.test_T, axes=[1, 0])\n self.assertTrue(np.allclose(product, expected_product))\n def test_case_3(self):\n # Test output correctness with zeros\n product, _ = f_344(self.test_P_zeros, self.test_T)\n self.assertTrue(np.all(product == 0))\n def test_case_4(self):\n # Test return shape\n product, _ = f_344(self.test_P, self.test_T)\n expected_shape = (2, 3, 3)\n self.assertEqual(product.shape, expected_shape, \"Output shape is incorrect\")\n def test_case_5(self):\n # Test handling invalid input types\n with self.assertRaises(TypeError):\n f_344([1, 2], [2, 1])\n def test_case_6(self):\n # Test handling invalid shape\n P = np.array([[1, 2], [3, 4]])\n T = np.random.rand(3, 3, 3)\n with self.assertRaises(ValueError):\n f_344(P, T)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.ndarray", "numpy.sum", "numpy.tensordot", "seaborn.heatmap"], "libs": ["numpy", "seaborn"], "doc": {"description": ["Calculate the product of a matrix 'P' and a 3D tensor 'T' using numpy and visualize the results as a heatmap."], "note": ["This function only accepts numpy matrices/arrays."], "params": ["P (numpy.ndarray): Input matrix of shape (M, 3), where M can be any positive integer.", "T (numpy.ndarray): Input tensor of shape (3, 3, 3)."], "returns": ["numpy.ndarray: Resultant product after matrix-tensor multiplication.", "matplotlib.axes.Axes: Axes object displaying the heatmap of the 2D result."], "reqs": ["numpy", "seaborn"], "raises": [], "example": [">>> np.random.seed(0)", ">>> P = np.array([[6, 2, 7], [1, 1, 8]])", ">>> T = np.random.rand(3, 3, 3)", 
">>> product, heatmap = f_344(P, T)", ">>> product", "array([[[ 9.50686132, 11.96467131, 11.52469849],", "[ 9.99949817, 7.62347761, 9.48114103],", "[ 3.62770285, 9.87052195, 8.45068927]],", "", "[[ 7.15750903, 8.46701159, 8.96060503],", "[ 7.50619626, 5.04108634, 6.96116358],", "[ 1.47091192, 6.03135957, 2.94310891]]])", ">>> type(heatmap)", ""]}} -{"task_id": "f_746", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_746(d, keys=['x', 'y', 'z']):\n \"\"\"\n Plot values from a list of dictionaries based on specified keys and return the plot as a Matplotlib Axes object.\n \n Parameters:\n d (list): A list of dictionaries containing numerical data.\n keys (list, optional): A list of string keys to plot. Defaults to ['x', 'y', 'z'].\n\n Returns:\n Matplotlib Axes object: The plot showing the values of specified keys from the input list of dictionaries.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n >>> ax = f_746(data)\n >>> type(ax)\n \n\n >>> ax = f_746(data, keys=['x', 'y'])\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " # Convert the list of dictionaries to a DataFrame\n df = pd.DataFrame(d)\n\n # Initialize a plot\n fig, ax = plt.subplots()\n \n # Plot the values for the specified keys\n plotted_keys = []\n for key in keys:\n if key in df.columns:\n ax.plot(df[key], label=key)\n plotted_keys.append(key)\n \n # Add a legend if there are any lines plotted\n if plotted_keys:\n ax.legend()\n \n # Return the Axes object\n return ax", "test": "import unittest\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \n def test_basic_input(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n ax = f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'x', 'y', 'z'})\n self.assertEqual(len(ax.lines), 3)\n 
def test_missing_keys_in_data(self):\n data = [{'x': 1, 'y': 10}, {'y': 15, 'z': 6}, {'x': 2, 'z': 7}]\n ax = f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'x', 'y', 'z'})\n self.assertEqual(len(ax.lines), 3)\n def test_custom_keys(self):\n data = [{'a': 1, 'b': 10}, {'b': 15, 'c': 6}, {'a': 2, 'c': 7}]\n ax = f_746(data, keys=['a', 'b', 'c'])\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'a', 'b', 'c'})\n self.assertEqual(len(ax.lines), 3)\n def test_empty_data_list(self):\n data = []\n ax = f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(len(ax.lines), 0)\n self.assertIsNone(ax.legend_)\n def test_single_key_data(self):\n data = [{'x': 1}, {'x': 2}, {'x': 3}]\n ax = f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'x'})\n self.assertEqual(len(ax.lines), 1)", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Plot values from a list of dictionaries based on specified keys and return the plot as a Matplotlib Axes object.", ">>> ax = f_746(data, keys=['x', 'y'])", ">>> type(ax)", ""], "note": [], "params": ["d (list): A list of dictionaries containing numerical data.", "keys (list, optional): A list of string keys to plot. 
Defaults to ['x', 'y', 'z']."], "returns": ["Matplotlib Axes object: The plot showing the values of specified keys from the input list of dictionaries."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]", ">>> ax = f_746(data)", ">>> type(ax)", ""]}} +{"task_id": "f_344", "prompt": "import numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_344(P, T):\n \"\"\"\n Calculate the product of a matrix 'P' and a 3D tensor 'T' using numpy and visualize the results as a heatmap.\n Note: This function only accepts numpy matrices/arrays.\n\n Parameters:\n - P (numpy.ndarray): Input matrix of shape (M, 3), where M can be any positive integer.\n - T (numpy.ndarray): Input tensor of shape (3, 3, 3).\n\n Returns:\n - numpy.ndarray: Resultant product after matrix-tensor multiplication.\n - matplotlib.axes.Axes: Axes object displaying the heatmap of the 2D result.\n\n Requirements:\n - numpy\n - seaborn\n\n Example:\n >>> np.random.seed(0)\n >>> P = np.array([[6, 2, 7], [1, 1, 8]])\n >>> T = np.random.rand(3, 3, 3)\n >>> product, heatmap = f_344(P, T)\n >>> product\n array([[[ 9.50686132, 11.96467131, 11.52469849],\n [ 9.99949817, 7.62347761, 9.48114103],\n [ 3.62770285, 9.87052195, 8.45068927]],\n \n [[ 7.15750903, 8.46701159, 8.96060503],\n [ 7.50619626, 5.04108634, 6.96116358],\n [ 1.47091192, 6.03135957, 2.94310891]]])\n >>> type(heatmap)\n \n \"\"\"", "canonical_solution": " if not (isinstance(P, np.ndarray) and isinstance(T, np.ndarray)):\n raise TypeError(\"Expected inputs to be numpy arrays\")\n\n result = np.tensordot(P, T, axes=[1, 0])\n # Sum along the last dimension to get a 2D matrix\n result_2D = np.sum(result, axis=-1)\n heatmap = sns.heatmap(result_2D)\n return result, heatmap", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n 
np.random.seed(0)\n self.test_P = np.array([[6, 2, 7], [1, 1, 8]])\n self.test_P_zeros = np.zeros((2, 3))\n self.test_T = np.array(\n [\n [[1, 2, 3], [4, 5, 6], [7, 8, 9]],\n [[2, 3, 4], [5, 6, 7], [8, 9, 10]],\n [[3, 4, 5], [6, 7, 8], [9, 10, 11]],\n ]\n )\n def test_case_1(self):\n # Test return types\n product, heatmap = f_344(self.test_P, self.test_T)\n self.assertIsInstance(product, np.ndarray)\n self.assertIsInstance(heatmap, plt.Axes)\n def test_case_2(self):\n # Test output correctness\n product, _ = f_344(self.test_P, self.test_T)\n expected_product = np.tensordot(self.test_P, self.test_T, axes=[1, 0])\n self.assertTrue(np.allclose(product, expected_product))\n def test_case_3(self):\n # Test output correctness with zeros\n product, _ = f_344(self.test_P_zeros, self.test_T)\n self.assertTrue(np.all(product == 0))\n def test_case_4(self):\n # Test return shape\n product, _ = f_344(self.test_P, self.test_T)\n expected_shape = (2, 3, 3)\n self.assertEqual(product.shape, expected_shape, \"Output shape is incorrect\")\n def test_case_5(self):\n # Test handling invalid input types\n with self.assertRaises(TypeError):\n f_344([1, 2], [2, 1])\n def test_case_6(self):\n # Test handling invalid shape\n P = np.array([[1, 2], [3, 4]])\n T = np.random.rand(3, 3, 3)\n with self.assertRaises(ValueError):\n f_344(P, T)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["seaborn.heatmap", "numpy.sum", "numpy.ndarray", "numpy.tensordot"], "libs": ["seaborn", "numpy"], "doc": {"description": ["Calculate the product of a matrix 'P' and a 3D tensor 'T' using numpy and visualize the results as a heatmap."], "note": ["This function only accepts numpy matrices/arrays."], "params": ["P (numpy.ndarray): Input matrix of shape (M, 3), where M can be any positive integer.", "T (numpy.ndarray): Input tensor of shape (3, 3, 3)."], "returns": ["numpy.ndarray: Resultant product after matrix-tensor multiplication.", "matplotlib.axes.Axes: Axes object displaying the heatmap of the 2D 
result."], "reqs": ["numpy", "seaborn"], "raises": [], "example": [">>> np.random.seed(0)", ">>> P = np.array([[6, 2, 7], [1, 1, 8]])", ">>> T = np.random.rand(3, 3, 3)", ">>> product, heatmap = f_344(P, T)", ">>> product", "array([[[ 9.50686132, 11.96467131, 11.52469849],", "[ 9.99949817, 7.62347761, 9.48114103],", "[ 3.62770285, 9.87052195, 8.45068927]],", "", "[[ 7.15750903, 8.46701159, 8.96060503],", "[ 7.50619626, 5.04108634, 6.96116358],", "[ 1.47091192, 6.03135957, 2.94310891]]])", ">>> type(heatmap)", ""]}} +{"task_id": "f_746", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_746(d, keys=['x', 'y', 'z']):\n \"\"\"\n Plot values from a list of dictionaries based on specified keys and return the plot as a Matplotlib Axes object.\n \n Parameters:\n d (list): A list of dictionaries containing numerical data.\n keys (list, optional): A list of string keys to plot. Defaults to ['x', 'y', 'z'].\n\n Returns:\n Matplotlib Axes object: The plot showing the values of specified keys from the input list of dictionaries.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n >>> ax = f_746(data)\n >>> type(ax)\n \n\n >>> ax = f_746(data, keys=['x', 'y'])\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " # Convert the list of dictionaries to a DataFrame\n df = pd.DataFrame(d)\n\n # Initialize a plot\n fig, ax = plt.subplots()\n \n # Plot the values for the specified keys\n plotted_keys = []\n for key in keys:\n if key in df.columns:\n ax.plot(df[key], label=key)\n plotted_keys.append(key)\n \n # Add a legend if there are any lines plotted\n if plotted_keys:\n ax.legend()\n \n # Return the Axes object\n return ax", "test": "import unittest\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \n def test_basic_input(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n ax = 
f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'x', 'y', 'z'})\n self.assertEqual(len(ax.lines), 3)\n def test_missing_keys_in_data(self):\n data = [{'x': 1, 'y': 10}, {'y': 15, 'z': 6}, {'x': 2, 'z': 7}]\n ax = f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'x', 'y', 'z'})\n self.assertEqual(len(ax.lines), 3)\n def test_custom_keys(self):\n data = [{'a': 1, 'b': 10}, {'b': 15, 'c': 6}, {'a': 2, 'c': 7}]\n ax = f_746(data, keys=['a', 'b', 'c'])\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'a', 'b', 'c'})\n self.assertEqual(len(ax.lines), 3)\n def test_empty_data_list(self):\n data = []\n ax = f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(len(ax.lines), 0)\n self.assertIsNone(ax.legend_)\n def test_single_key_data(self):\n data = [{'x': 1}, {'x': 2}, {'x': 3}]\n ax = f_746(data)\n self.assertIsInstance(ax, Axes)\n self.assertEqual(set([text.get_text() for text in ax.legend_.texts]), {'x'})\n self.assertEqual(len(ax.lines), 1)", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Plot values from a list of dictionaries based on specified keys and return the plot as a Matplotlib Axes object.", ">>> ax = f_746(data, keys=['x', 'y'])", ">>> type(ax)", ""], "note": [], "params": ["d (list): A list of dictionaries containing numerical data.", "keys (list, optional): A list of string keys to plot. 
Defaults to ['x', 'y', 'z']."], "returns": ["Matplotlib Axes object: The plot showing the values of specified keys from the input list of dictionaries."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]", ">>> ax = f_746(data)", ">>> type(ax)", ""]}} {"task_id": "f_745", "prompt": "import pandas as pd\nfrom sklearn.linear_model import LinearRegression\n\ndef f_745(d, target='z'):\n \"\"\"\n Perform linear regression to \"x,\" \"y,\" against \"z\" from a list of dictionaries \"d.\"\n\n Parameters:\n d (list): A list of dictionaries.\n target (str): The target variable for the regression.\n\n Returns:\n LinearRegression: A LinearRegression model.\n\n Requirements:\n - pandas\n - sklearn.linear_model.LinearRegression\n\n Examples:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n >>> model = f_745(data)\n >>> isinstance(model, LinearRegression)\n True\n\n >>> data = [{'x': 4, 'y': 20, 'z': 10}, {'x': 5, 'y': 25, 'z': 15}, {'x': 6, 'y': 5, 'z': 20}]\n >>> model = f_745(data, target='y')\n >>> isinstance(model, LinearRegression)\n True\n \"\"\"", "canonical_solution": " df = pd.DataFrame(d)\n predictors = [k for k in df.columns if k != target]\n\n X = df[predictors]\n y = df[target]\n\n model = LinearRegression().fit(X, y)\n\n return model", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def test_basic_regression(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n model = f_745(data)\n self.assertIsInstance(model, LinearRegression)\n self.assertEqual(len(model.coef_), 2)\n def test_negative_values(self):\n data = [{'x': -1, 'y': -10, 'z': -5}, {'x': -3, 'y': -15, 'z': -6}, {'x': -2, 'y': -1, 'z': -7}]\n model = f_745(data)\n self.assertIsInstance(model, LinearRegression)\n self.assertEqual(len(model.coef_), 2)\n \n def test_zero_values(self):\n data 
= [{'x': 0, 'y': 0, 'z': 0}, {'x': 0, 'y': 0, 'z': 0}, {'x': 0, 'y': 0, 'z': 0}]\n model = f_745(data)\n self.assertIsInstance(model, LinearRegression)\n self.assertEqual(len(model.coef_), 2)\n \n def test_different_target(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n model = f_745(data, target='y')\n self.assertIsInstance(model, LinearRegression)\n self.assertEqual(len(model.coef_), 2)\n \n def test_single_predictor(self):\n data = [{'x': 1, 'z': 5}, {'x': 3, 'z': 6}, {'x': 2, 'z': 7}]\n model = f_745(data, target='z')\n self.assertIsInstance(model, LinearRegression)\n self.assertEqual(len(model.coef_), 1)", "apis": ["pandas.DataFrame", "sklearn.linear_model.LinearRegression"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform linear regression to \"x,\" \"y,\" against \"z\" from a list of dictionaries \"d.\"", ">>> data = [{'x': 4, 'y': 20, 'z': 10}, {'x': 5, 'y': 25, 'z': 15}, {'x': 6, 'y': 5, 'z': 20}]", ">>> model = f_745(data, target='y')", ">>> isinstance(model, LinearRegression)", "True"], "note": [], "params": ["d (list): A list of dictionaries.", "target (str): The target variable for the regression."], "returns": ["LinearRegression: A LinearRegression model."], "reqs": ["pandas", "sklearn.linear_model.LinearRegression"], "raises": [], "example": ["Examples:", ">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]", ">>> model = f_745(data)", ">>> isinstance(model, LinearRegression)", "True"]}} -{"task_id": "f_545", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_545(df, col):\n \"\"\"\n Process a Pandas DataFrame by removing a specific column and adding a 'IsEvenIndex' column.\n The 'IsEvenIndex' column is a boolean flag indicating if the index of each row is even.\n \n Parameters:\n - df (pd.DataFrame): The pandas DataFrame to process.\n - col (str): The column to remove.\n\n Returns:\n - df (pd.DataFrame): The processed pandas DataFrame 
with the specified column removed and a new 'IsEvenIndex' column added.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.randint(0,100,size=(5, 4)), columns=list('ABCD'))\n >>> df = f_545(df, 'C')\n >>> print(df)\n A B D IsEvenIndex\n 0 51 92 71 True\n 1 60 20 86 False\n 2 74 74 99 True\n 3 23 2 52 False\n 4 1 87 37 True\n \"\"\"", "canonical_solution": " # Remove specified column using pandas\n updated_df = pd.DataFrame(df).drop(col, axis=1)\n \n # Add a new column 'IsEvenIndex' using numpy to determine if index is even\n # The np.arange(len(updated_df)) creates an array of indexes, % 2 == 0 checks if they are even\n updated_df['IsEvenIndex'] = np.arange(len(updated_df)) % 2 == 0\n \n return updated_df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'A')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('A' in df.columns)\n def test_case_2(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'B')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('B' in df.columns)\n def test_case_3(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'C')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('C' in df.columns)\n def test_case_4(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'D')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('D' in df.columns)\n def test_case_5(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'A')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('A' in df.columns)", "apis": ["numpy.arange", "pandas.DataFrame"], "libs": ["numpy", "pandas"], "doc": {"description": ["Process a Pandas DataFrame 
by removing a specific column and adding a 'IsEvenIndex' column.", "The 'IsEvenIndex' column is a boolean flag indicating if the index of each row is even."], "note": [], "params": ["df (pd.DataFrame): The pandas DataFrame to process.", "col (str): The column to remove."], "returns": ["df (pd.DataFrame): The processed pandas DataFrame with the specified column removed and a new 'IsEvenIndex' column added."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.randint(0,100,size=(5, 4)), columns=list('ABCD'))", ">>> df = f_545(df, 'C')", ">>> print(df)", "A B D IsEvenIndex", "0 51 92 71 True", "1 60 20 86 False", "2 74 74 99 True", "3 23 2 52 False", "4 1 87 37 True"]}} -{"task_id": "f_385", "prompt": "import os\nimport pandas as pd\nimport re\n\n\ndef f_385(file_path: str) -> pd.DataFrame:\n \"\"\"\n Parse a log file to extract log entries into a DataFrame.\n\n This function reads a log file line by line. The log file is assumed to follow this format\n for each entry: YYYY-MM-DD HH:MM:SS.ssssss - LEVEL - Message\n The function matches each line against a predefined regular expression to extract timestamp,\n log level, and message, ignoring lines where there is no match. 
It then aggregates the matched\n and extracted data into a pandas DataFrame with columns: 'Timestamp', 'Level', and 'Message'.\n If the logs are empty or there is no extracted data, this function returns an otherwise empty\n DataFrame containing the same expected columns.\n\n Parameters:\n - file_path (str): The path to the log file to be parsed.\n\n Returns:\n - pd.DataFrame: A DataFrame with columns 'Timestamp', 'Level', and 'Message'.\n\n Requirements:\n - re\n - os\n - pandas\n\n Example:\n Given a log file with content:\n ```\n 2023-01-01 12:00:00.000000 - INFO - Application started\n 2023-01-01 12:01:00.000000 - ERROR - Failed to connect to database\n ```\n >>> df = f_385(\"path_to_log_file.txt\")\n >>> type(df)\n \n >>> df.iloc[0]\n Timestamp 2023-01-01 12:00:00.000000\n Level INFO\n Message Application started\n Name: 0, dtype: object\n \"\"\"", "canonical_solution": " LOG_REGEX = r\"(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}) - (\\w+) - (.+)$\"\n\n if not os.path.exists(file_path):\n raise FileNotFoundError(f\"The file {file_path} does not exist.\")\n\n logs = []\n with open(file_path, \"r\") as f:\n for line in f:\n match = re.match(LOG_REGEX, line)\n if match:\n timestamp, level, message = match.groups()\n logs.append([timestamp, level, message])\n\n df = pd.DataFrame(logs, columns=[\"Timestamp\", \"Level\", \"Message\"])\n\n if df.empty:\n df = pd.DataFrame(columns=[\"Timestamp\", \"Level\", \"Message\"])\n\n return df", "test": "import unittest\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n def tearDown(self):\n self.temp_dir.cleanup()\n def _create_temp_log_file(self, file_name: str, content: str):\n \"\"\"Helper function to create a temporary log file.\"\"\"\n path = os.path.join(self.temp_dir.name, file_name)\n with open(path, \"w\") as f:\n f.write(content)\n return path\n def test_case_1(self):\n # Test log file with mixed levels\n content = (\n 
\"2023-01-01 12:00:00.000000 - INFO - Application started\\n\"\n \"2023-01-01 12:01:00.000000 - ERROR - Failed to connect to database\\n\"\n )\n log_file_path = self._create_temp_log_file(\"log1.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 2)\n self.assertEqual(df.iloc[0][\"Level\"], \"INFO\")\n self.assertEqual(df.iloc[1][\"Level\"], \"ERROR\")\n def test_case_2(self):\n # Test case for an empty log file\n log_file_path = self._create_temp_log_file(\"log2.txt\", \"\")\n df = f_385(log_file_path)\n self.assertTrue(df.empty)\n def test_case_3(self):\n # Log file with lines that do not match the expected format\n content = \"This is not a valid log entry\\n2023-01-02 13:00:00.000000 - WARNING - Low disk space\\n\"\n log_file_path = self._create_temp_log_file(\"log3.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 1)\n self.assertEqual(df.iloc[0][\"Level\"], \"WARNING\")\n def test_caes_4(self):\n # Test case to ensure FileNotFoundError is raised when log file does not exist\n with self.assertRaises(FileNotFoundError):\n f_385(\"/path/to/nonexistent/file.txt\")\n def test_case_5(self):\n # Log file with some entries having minor formatting issues\n content = (\n \"2023-01-03 14:00:00.000000 - DEBUG - Debugging info included\\n\"\n \"2023-01-03 Not a valid entry\\n\"\n \"WARNING - This log entry is missing its timestamp\\n\"\n \"2023-01-04 15:00:00.000000 - INFO - System update completed\\n\"\n \"Some random text not conforming to the log format\\n\"\n \"2023-01-04 16:00:00.000000 - ERROR - Error in processing\\n\"\n )\n log_file_path = self._create_temp_log_file(\"log5.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 3)\n self.assertEqual(df.iloc[0][\"Level\"], \"DEBUG\")\n self.assertEqual(df.iloc[1][\"Level\"], \"INFO\")\n self.assertEqual(df.iloc[2][\"Level\"], \"ERROR\")\n def test_case_6(self):\n # Log file with multi-line entries\n content = (\n \"2023-02-01 10:00:00.000000 - INFO - 
Application start successful\\n\"\n \"2023-02-01 10:05:00.000000 - ERROR - Exception occurred:\\n\"\n \"Traceback (most recent call last):\\n\"\n ' File \"\", line 1, in \\n'\n \"ZeroDivisionError: division by zero\\n\"\n \"2023-02-01 10:10:00.000000 - INFO - Recovery attempt initiated\\n\"\n )\n log_file_path = self._create_temp_log_file(\"log6.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 3)\n self.assertEqual(df.iloc[0][\"Level\"], \"INFO\")\n self.assertEqual(df.iloc[1][\"Level\"], \"ERROR\")\n self.assertEqual(df.iloc[2][\"Level\"], \"INFO\")\n self.assertTrue(\"Exception occurred:\" in df.iloc[1][\"Message\"])\n self.assertFalse(\n \"Traceback\" in df.iloc[1][\"Message\"]\n or \"ZeroDivisionError\" in df.iloc[1][\"Message\"]\n )", "apis": ["os.path.exists", "pandas.DataFrame", "os.path", "re.match"], "libs": ["pandas", "re", "os"], "doc": {"description": ["Parse a log file to extract log entries into a DataFrame.", "This function reads a log file line by line. The log file is assumed to follow this format", "for each entry: YYYY-MM-DD HH:MM:SS.ssssss - LEVEL - Message", "The function matches each line against a predefined regular expression to extract timestamp,", "log level, and message, ignoring lines where there is no match. 
It then aggregates the matched", "and extracted data into a pandas DataFrame with columns: 'Timestamp', 'Level', and 'Message'.", "If the logs are empty or there is no extracted data, this function returns an otherwise empty", "DataFrame containing the same expected columns."], "note": [], "params": ["file_path (str): The path to the log file to be parsed."], "returns": ["pd.DataFrame: A DataFrame with columns 'Timestamp', 'Level', and 'Message'."], "reqs": ["re", "os", "pandas"], "raises": [], "example": ["Given a log file with content:", "```", "2023-01-01 12:00:00.000000 - INFO - Application started", "2023-01-01 12:01:00.000000 - ERROR - Failed to connect to database", "```", ">>> df = f_385(\"path_to_log_file.txt\")", ">>> type(df)", "", ">>> df.iloc[0]", "Timestamp 2023-01-01 12:00:00.000000", "Level INFO", "Message Application started", "Name: 0, dtype: object"]}} -{"task_id": "f_870", "prompt": "import binascii\nimport urllib.parse\n\n\ndef f_870(url):\n \"\"\"\n Decode a hexadecimal string from the 'q' query parameter of a URL.\n\n This function extracts the 'q' query parameter from the given URL,\n assumes it is a hexadecimal string, and decodes it into a UTF-8 string.\n If the hexadecimal string is invalid or cannot be decoded into a valid UTF-8 string, None is returned.\n\n Parameters:\n url (str): The URL to extract the query parameter from.\n\n Returns:\n str or None: The decoded string if the 'q' parameter exists and is a valid hexadecimal, otherwise None.\n\n Requirements:\n - binascii\n - urllib.parse\n \n Example:\n >>> f_870('https://www.example.com?q=4a4b4c')\n 'JKL'\n \"\"\"", "canonical_solution": " try:\n parsed_url = urllib.parse.urlparse(url)\n query = urllib.parse.parse_qs(parsed_url.query).get(\"q\", [None])[0]\n return binascii.unhexlify(query).decode(\"utf-8\") if query else None\n except (binascii.Error, UnicodeDecodeError):\n return None", "test": "import unittest\nimport binascii\nimport urllib.parse\nclass 
TestCases(unittest.TestCase):\n \"\"\"Test cases for f_870.\"\"\"\n def test_valid_hex_string(self):\n \"\"\"Test with a valid hex string in query parameter.\"\"\"\n url = \"https://www.example.com?q=4a4b4c\"\n self.assertEqual(f_870(url), \"JKL\")\n def test_no_query_parameter(self):\n \"\"\"Test with no query parameter.\"\"\"\n url = \"https://www.example.com\"\n self.assertIsNone(f_870(url))\n def test_invalid_hex_string(self):\n \"\"\"Test with an invalid hex string in query parameter.\"\"\"\n url = \"https://www.example.com?q=4a4b4c4d4\"\n self.assertIsNone(\n f_870(url)\n ) # Updated to assertIsNone as the function now handles the exception\n def test_valid_hex_non_utf8(self):\n \"\"\"Test with a valid hex string that is not valid UTF-8.\"\"\"\n url = \"https://www.example.com?q=80\"\n self.assertIsNone(\n f_870(url)\n ) # Updated to assertIsNone due to the handling of UnicodeDecodeError\n def test_multiple_query_parameters(self):\n \"\"\"Test with multiple query parameters.\"\"\"\n url = \"https://www.example.com?a=123&q=4a4b4c&b=456\"\n self.assertEqual(f_870(url), \"JKL\")", "apis": ["binascii.Error", "binascii.unhexlify", "urllib.parse", "urllib.parse.urlparse", "urllib.parse.parse_qs"], "libs": ["urllib", "binascii"], "doc": {"description": ["Decode a hexadecimal string from the 'q' query parameter of a URL.", "This function extracts the 'q' query parameter from the given URL,", "assumes it is a hexadecimal string, and decodes it into a UTF-8 string.", "If the hexadecimal string is invalid or cannot be decoded into a valid UTF-8 string, None is returned."], "note": [], "params": ["url (str): The URL to extract the query parameter from."], "returns": ["str or None: The decoded string if the 'q' parameter exists and is a valid hexadecimal, otherwise None."], "reqs": ["binascii", "urllib.parse"], "raises": [], "example": [">>> f_870('https://www.example.com?q=4a4b4c')", "'JKL'"]}} -{"task_id": "f_827", "prompt": "import pandas as pd\nimport 
matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\n\n\ndef f_827(df, x_column, y_column):\n \"\"\"\n Draws a scatter plot for the specified columns from a pandas DataFrame and fits a linear regression model to the data.\n\n Parameters:\n df (DataFrame): The input pandas DataFrame.\n x_column (str): The column name for the x-axis. Data contained in column must be numeric.\n y_column (str): The column name for the y-axis. Data contained in column must be numeric.\n\n Returns:\n matplotlib.axes._subplots.AxesSubplot: The Axes object containing the scatter plot and the linear regression line.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib\n - sklearn\n\n Notes:\n - After plotting the scatterplot, this function overlays the predicted regression line on top in red on the same Axes.\n\n Example:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})\n >>> ax = f_827(df, 'A', 'B')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " X = df[x_column].values.reshape(-1, 1)\n Y = df[y_column].values\n reg = LinearRegression().fit(X, Y)\n Y_pred = reg.predict(X)\n\n fig, ax = plt.subplots()\n ax.scatter(X, Y)\n ax.plot(X, Y_pred, color=\"red\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n def helper_assert_line_correctness(self, ax, expected_slope, expected_intercept):\n # Helper function to check if linear regression predictions are correct\n tolerance = 1e-6\n # Extract line data\n line = ax.lines[0]\n x_data, y_data = line.get_xdata(), line.get_ydata()\n # Calculate slope and intercept of the line plot\n calculated_slope = (y_data[-1] - y_data[0]) / (x_data[-1] - x_data[0])\n calculated_intercept = y_data[0] - calculated_slope * x_data[0]\n # Assert slope and intercept\n self.assertAlmostEqual(\n calculated_slope,\n expected_slope,\n delta=tolerance,\n msg=\"Slope did not match expected value\",\n )\n self.assertAlmostEqual(\n 
calculated_intercept,\n expected_intercept,\n delta=tolerance,\n msg=\"Intercept did not match expected value\",\n )\n def test_plot_attributes(self):\n # Basic case to test plot is correct\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [1, 2, 3, 4]})\n ax = f_827(df, \"X\", \"Y\")\n self.assertIsInstance(ax, Axes)\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(len(ax.collections), 1)\n def test_linear_positive_slope(self):\n # Testing with a dataset that should produce a positive slope\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [2, 4, 6, 8]})\n ax = f_827(df, \"X\", \"Y\")\n self.helper_assert_line_correctness(ax, expected_slope=2, expected_intercept=0)\n def test_linear_negative_slope(self):\n # Testing with a dataset that should produce a negative slope\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [8, 6, 4, 2]})\n ax = f_827(df, \"X\", \"Y\")\n self.helper_assert_line_correctness(\n ax, expected_slope=-2, expected_intercept=10\n )\n def test_linear_zero_slope(self):\n # Testing with a dataset that should produce a zero slope\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [5, 5, 5, 5]})\n ax = f_827(df, \"X\", \"Y\")\n self.helper_assert_line_correctness(ax, expected_slope=0, expected_intercept=5)\n def test_single_data_point(self):\n # Testing with a DataFrame having a single data point\n df = pd.DataFrame({\"X\": [1], \"Y\": [1]})\n ax = f_827(df, \"X\", \"Y\")\n self.assertIsInstance(ax, Axes)\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(len(ax.collections), 1)\n def test_missing_values(self):\n # Testing with missing values in the DataFrame\n df = pd.DataFrame({\"X\": [1, 2, np.nan, 4], \"Y\": [1, np.nan, 3, 4]})\n with self.assertRaises(ValueError):\n f_827(df, \"X\", \"Y\")\n def test_with_categorical_data(self):\n # Testing with categorical data to ensure it fails\n df = pd.DataFrame({\"X\": [\"a\", \"b\", \"c\"], \"Y\": [\"d\", \"e\", \"f\"]})\n with self.assertRaises(ValueError):\n f_827(df, \"X\", \"Y\")\n def 
test_incorrect_column_names(self):\n # Testing with incorrect column names\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n with self.assertRaises(KeyError):\n f_827(df, \"X\", \"Y\")", "apis": ["sklearn.linear_model.LinearRegression", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "sklearn"], "doc": {"description": ["Draws a scatter plot for the specified columns from a pandas DataFrame and fits a linear regression model to the data.", "Notes:", "- After plotting the scatterplot, this function overlays the predicted regression line on top in red on the same Axes."], "note": [], "params": ["df (DataFrame): The input pandas DataFrame.", "x_column (str): The column name for the x-axis. Data contained in column must be numeric.", "y_column (str): The column name for the y-axis. Data contained in column must be numeric."], "returns": ["matplotlib.axes._subplots.AxesSubplot: The Axes object containing the scatter plot and the linear regression line."], "reqs": ["pandas", "numpy", "matplotlib", "sklearn"], "raises": [], "example": [">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})", ">>> ax = f_827(df, 'A', 'B')", ">>> type(ax)", ""]}} -{"task_id": "f_906", "prompt": "import pandas as pd\nfrom matplotlib import pyplot as plt\n\n\ndef f_906(arr):\n \"\"\"\n Calculate the sum of each row in a 2D numpy array and plot these sums as a time series.\n\n This function takes a 2D numpy array and computes the sum of elements in each row. It\n then creates a Pandas DataFrame with these row sums and plots them as a time series,\n using dates starting from January 1, 2020, for each row.\n\n Parameters:\n arr (numpy.ndarray): A 2D numpy array.\n\n Returns:\n matplotlib.axes._subplots.AxesSubplot: A plot representing the time series of row sums.\n\n Requirements:\n - pandas\n - matplotlib\n\n Handling Scenarios:\n - For non-empty arrays: The function computes the sum of elements for each row, \n stores these sums in a Pandas DataFrame, and then plots them. 
Each row in the plot represents \n the sum for a specific day, starting from January 1, 2020.\n - For empty arrays: The function creates an empty plot with the \n title 'Time Series of Row Sums' but without data. This is achieved by checking if the array size \n is zero (empty array) and if so, creating a subplot without any data.\n \n Note: \n - The function uses 'pandas' for DataFrame creation and 'matplotlib.pyplot' for plotting. \n The dates in the plot start from January 1, 2020, and each subsequent row represents the next day.\n \n Example:\n >>> arr = np.array([[i + j for i in range(3)] for j in range(5)])\n >>> ax = f_906(arr)\n >>> ax.get_title()\n 'Time Series of Row Sums'\n \"\"\"", "canonical_solution": " if not arr.size: # Check for empty array\n _, ax = plt.subplots()\n ax.set_title(\"Time Series of Row Sums\")\n return ax\n\n row_sums = arr.sum(axis=1)\n df = pd.DataFrame(row_sums, columns=[\"Sum\"])\n df.index = pd.date_range(start=\"1/1/2020\", periods=df.shape[0])\n ax = df.plot(title=\"Time Series of Row Sums\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_906.\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test the basic functionality of the function.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n ax = f_906(arr)\n # Check if the function returns AxesSubplot object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of rows\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sums = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sums)\n def test_empty_array(self):\n \"\"\"Test the function with an empty array.\"\"\"\n arr = np.array([])\n ax = f_906(arr)\n # Check if the function returns AxesSubplot object\n 
self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted is empty\n lines = ax.get_lines()\n self.assertEqual(len(lines), 0)\n def test_single_row_array(self):\n \"\"\"Test the function with a single row array.\"\"\"\n arr = np.array([[1, 2, 3]])\n ax = f_906(arr)\n # Check if the function returns AxesSubplot object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of the single row\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sum = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sum)\n def test_negative_values(self):\n \"\"\"Test the function with negative values.\"\"\"\n arr = np.array([[-1, -2, -3], [-4, -5, -6]])\n ax = f_906(arr)\n # Check if the function returns AxesSubplot object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of rows\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sums = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sums)\n def test_zero_values(self):\n \"\"\"Test the function with zero values.\"\"\"\n arr = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])\n ax = f_906(arr)\n # Check if the function returns AxesSubplot object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of rows\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sums = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sums)\n def tearDown(self):\n plt.close()", "apis": ["pandas.date_range", "pandas.DataFrame", "matplotlib.pyplot.subplots"], 
"libs": ["matplotlib", "pandas"], "doc": {"description": ["Calculate the sum of each row in a 2D numpy array and plot these sums as a time series.", "This function takes a 2D numpy array and computes the sum of elements in each row. It", "then creates a Pandas DataFrame with these row sums and plots them as a time series,", "using dates starting from January 1, 2020, for each row.", "Handling Scenarios:", "- For non-empty arrays: The function computes the sum of elements for each row,", "stores these sums in a Pandas DataFrame, and then plots them. Each row in the plot represents", "the sum for a specific day, starting from January 1, 2020.", "- For empty arrays: The function creates an empty plot with the", "title 'Time Series of Row Sums' but without data. This is achieved by checking if the array size", "is zero (empty array) and if so, creating a subplot without any data."], "note": ["The function uses 'pandas' for DataFrame creation and 'matplotlib.pyplot' for plotting.", "The dates in the plot start from January 1, 2020, and each subsequent row represents the next day."], "params": ["arr (numpy.ndarray): A 2D numpy array."], "returns": ["matplotlib.axes._subplots.AxesSubplot: A plot representing the time series of row sums."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> arr = np.array([[i + j for i in range(3)] for j in range(5)])", ">>> ax = f_906(arr)", ">>> ax.get_title()", "'Time Series of Row Sums'"]}} +{"task_id": "f_545", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_545(df, col):\n \"\"\"\n Process a Pandas DataFrame by removing a specific column and adding a 'IsEvenIndex' column.\n The 'IsEvenIndex' column is a boolean flag indicating if the index of each row is even.\n \n Parameters:\n - df (pd.DataFrame): The pandas DataFrame to process.\n - col (str): The column to remove.\n\n Returns:\n - df (pd.DataFrame): The processed pandas DataFrame with the specified column removed and a new 'IsEvenIndex' column added.\n\n 
Requirements:\n - pandas\n - numpy\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.randint(0,100,size=(5, 4)), columns=list('ABCD'))\n >>> df = f_545(df, 'C')\n >>> print(df)\n A B D IsEvenIndex\n 0 51 92 71 True\n 1 60 20 86 False\n 2 74 74 99 True\n 3 23 2 52 False\n 4 1 87 37 True\n \"\"\"", "canonical_solution": " # Remove specified column using pandas\n updated_df = pd.DataFrame(df).drop(col, axis=1)\n \n # Add a new column 'IsEvenIndex' using numpy to determine if index is even\n # The np.arange(len(updated_df)) creates an array of indexes, % 2 == 0 checks if they are even\n updated_df['IsEvenIndex'] = np.arange(len(updated_df)) % 2 == 0\n \n return updated_df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'A')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('A' in df.columns)\n def test_case_2(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'B')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('B' in df.columns)\n def test_case_3(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'C')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('C' in df.columns)\n def test_case_4(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'D')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('D' in df.columns)\n def test_case_5(self):\n df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n df = f_545(df, 'A')\n self.assertEqual(df.shape, (100, 4))\n self.assertFalse('A' in df.columns)", "apis": ["pandas.DataFrame", "numpy.arange"], "libs": ["numpy", "pandas"], "doc": {"description": ["Process a Pandas DataFrame by removing a specific column and adding a 'IsEvenIndex' column.", "The 
'IsEvenIndex' column is a boolean flag indicating if the index of each row is even."], "note": [], "params": ["df (pd.DataFrame): The pandas DataFrame to process.", "col (str): The column to remove."], "returns": ["df (pd.DataFrame): The processed pandas DataFrame with the specified column removed and a new 'IsEvenIndex' column added."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.randint(0,100,size=(5, 4)), columns=list('ABCD'))", ">>> df = f_545(df, 'C')", ">>> print(df)", "A B D IsEvenIndex", "0 51 92 71 True", "1 60 20 86 False", "2 74 74 99 True", "3 23 2 52 False", "4 1 87 37 True"]}} +{"task_id": "f_385", "prompt": "import os\nimport pandas as pd\nimport re\n\n\ndef f_385(file_path: str) -> pd.DataFrame:\n \"\"\"\n Parse a log file to extract log entries into a DataFrame.\n\n This function reads a log file line by line. The log file is assumed to follow this format\n for each entry: YYYY-MM-DD HH:MM:SS.ssssss - LEVEL - Message\n The function matches each line against a predefined regular expression to extract timestamp,\n log level, and message, ignoring lines where there is no match. 
It then aggregates the matched\n and extracted data into a pandas DataFrame with columns: 'Timestamp', 'Level', and 'Message'.\n If the logs are empty or there is no extracted data, this function returns an otherwise empty\n DataFrame containing the same expected columns.\n\n Parameters:\n - file_path (str): The path to the log file to be parsed.\n\n Returns:\n - pd.DataFrame: A DataFrame with columns 'Timestamp', 'Level', and 'Message'.\n\n Requirements:\n - re\n - os\n - pandas\n\n Example:\n Given a log file with content:\n ```\n 2023-01-01 12:00:00.000000 - INFO - Application started\n 2023-01-01 12:01:00.000000 - ERROR - Failed to connect to database\n ```\n >>> df = f_385(\"path_to_log_file.txt\")\n >>> type(df)\n \n >>> df.iloc[0]\n Timestamp 2023-01-01 12:00:00.000000\n Level INFO\n Message Application started\n Name: 0, dtype: object\n \"\"\"", "canonical_solution": " LOG_REGEX = r\"(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}) - (\\w+) - (.+)$\"\n\n if not os.path.exists(file_path):\n raise FileNotFoundError(f\"The file {file_path} does not exist.\")\n\n logs = []\n with open(file_path, \"r\") as f:\n for line in f:\n match = re.match(LOG_REGEX, line)\n if match:\n timestamp, level, message = match.groups()\n logs.append([timestamp, level, message])\n\n df = pd.DataFrame(logs, columns=[\"Timestamp\", \"Level\", \"Message\"])\n\n if df.empty:\n df = pd.DataFrame(columns=[\"Timestamp\", \"Level\", \"Message\"])\n\n return df", "test": "import unittest\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n def tearDown(self):\n self.temp_dir.cleanup()\n def _create_temp_log_file(self, file_name: str, content: str):\n \"\"\"Helper function to create a temporary log file.\"\"\"\n path = os.path.join(self.temp_dir.name, file_name)\n with open(path, \"w\") as f:\n f.write(content)\n return path\n def test_case_1(self):\n # Test log file with mixed levels\n content = (\n 
\"2023-01-01 12:00:00.000000 - INFO - Application started\\n\"\n \"2023-01-01 12:01:00.000000 - ERROR - Failed to connect to database\\n\"\n )\n log_file_path = self._create_temp_log_file(\"log1.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 2)\n self.assertEqual(df.iloc[0][\"Level\"], \"INFO\")\n self.assertEqual(df.iloc[1][\"Level\"], \"ERROR\")\n def test_case_2(self):\n # Test case for an empty log file\n log_file_path = self._create_temp_log_file(\"log2.txt\", \"\")\n df = f_385(log_file_path)\n self.assertTrue(df.empty)\n def test_case_3(self):\n # Log file with lines that do not match the expected format\n content = \"This is not a valid log entry\\n2023-01-02 13:00:00.000000 - WARNING - Low disk space\\n\"\n log_file_path = self._create_temp_log_file(\"log3.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 1)\n self.assertEqual(df.iloc[0][\"Level\"], \"WARNING\")\n def test_caes_4(self):\n # Test case to ensure FileNotFoundError is raised when log file does not exist\n with self.assertRaises(FileNotFoundError):\n f_385(\"/path/to/nonexistent/file.txt\")\n def test_case_5(self):\n # Log file with some entries having minor formatting issues\n content = (\n \"2023-01-03 14:00:00.000000 - DEBUG - Debugging info included\\n\"\n \"2023-01-03 Not a valid entry\\n\"\n \"WARNING - This log entry is missing its timestamp\\n\"\n \"2023-01-04 15:00:00.000000 - INFO - System update completed\\n\"\n \"Some random text not conforming to the log format\\n\"\n \"2023-01-04 16:00:00.000000 - ERROR - Error in processing\\n\"\n )\n log_file_path = self._create_temp_log_file(\"log5.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 3)\n self.assertEqual(df.iloc[0][\"Level\"], \"DEBUG\")\n self.assertEqual(df.iloc[1][\"Level\"], \"INFO\")\n self.assertEqual(df.iloc[2][\"Level\"], \"ERROR\")\n def test_case_6(self):\n # Log file with multi-line entries\n content = (\n \"2023-02-01 10:00:00.000000 - INFO - 
Application start successful\\n\"\n \"2023-02-01 10:05:00.000000 - ERROR - Exception occurred:\\n\"\n \"Traceback (most recent call last):\\n\"\n ' File \"\", line 1, in \\n'\n \"ZeroDivisionError: division by zero\\n\"\n \"2023-02-01 10:10:00.000000 - INFO - Recovery attempt initiated\\n\"\n )\n log_file_path = self._create_temp_log_file(\"log6.txt\", content)\n df = f_385(log_file_path)\n self.assertEqual(len(df), 3)\n self.assertEqual(df.iloc[0][\"Level\"], \"INFO\")\n self.assertEqual(df.iloc[1][\"Level\"], \"ERROR\")\n self.assertEqual(df.iloc[2][\"Level\"], \"INFO\")\n self.assertTrue(\"Exception occurred:\" in df.iloc[1][\"Message\"])\n self.assertFalse(\n \"Traceback\" in df.iloc[1][\"Message\"]\n or \"ZeroDivisionError\" in df.iloc[1][\"Message\"]\n )", "apis": ["re.match", "os.path", "os.path.exists", "pandas.DataFrame"], "libs": ["re", "pandas", "os"], "doc": {"description": ["Parse a log file to extract log entries into a DataFrame.", "This function reads a log file line by line. The log file is assumed to follow this format", "for each entry: YYYY-MM-DD HH:MM:SS.ssssss - LEVEL - Message", "The function matches each line against a predefined regular expression to extract timestamp,", "log level, and message, ignoring lines where there is no match. 
It then aggregates the matched", "and extracted data into a pandas DataFrame with columns: 'Timestamp', 'Level', and 'Message'.", "If the logs are empty or there is no extracted data, this function returns an otherwise empty", "DataFrame containing the same expected columns."], "note": [], "params": ["file_path (str): The path to the log file to be parsed."], "returns": ["pd.DataFrame: A DataFrame with columns 'Timestamp', 'Level', and 'Message'."], "reqs": ["re", "os", "pandas"], "raises": [], "example": ["Given a log file with content:", "```", "2023-01-01 12:00:00.000000 - INFO - Application started", "2023-01-01 12:01:00.000000 - ERROR - Failed to connect to database", "```", ">>> df = f_385(\"path_to_log_file.txt\")", ">>> type(df)", "", ">>> df.iloc[0]", "Timestamp 2023-01-01 12:00:00.000000", "Level INFO", "Message Application started", "Name: 0, dtype: object"]}} +{"task_id": "f_870", "prompt": "import binascii\nimport urllib.parse\n\n\ndef f_870(url):\n \"\"\"\n Decode a hexadecimal string from the 'q' query parameter of a URL.\n\n This function extracts the 'q' query parameter from the given URL,\n assumes it is a hexadecimal string, and decodes it into a UTF-8 string.\n If the hexadecimal string is invalid or cannot be decoded into a valid UTF-8 string, None is returned.\n\n Parameters:\n url (str): The URL to extract the query parameter from.\n\n Returns:\n str or None: The decoded string if the 'q' parameter exists and is a valid hexadecimal, otherwise None.\n\n Requirements:\n - binascii\n - urllib.parse\n \n Example:\n >>> f_870('https://www.example.com?q=4a4b4c')\n 'JKL'\n \"\"\"", "canonical_solution": " try:\n parsed_url = urllib.parse.urlparse(url)\n query = urllib.parse.parse_qs(parsed_url.query).get(\"q\", [None])[0]\n return binascii.unhexlify(query).decode(\"utf-8\") if query else None\n except (binascii.Error, UnicodeDecodeError):\n return None", "test": "import unittest\nimport binascii\nimport urllib.parse\nclass 
TestCases(unittest.TestCase):\n \"\"\"Test cases for f_870.\"\"\"\n def test_valid_hex_string(self):\n \"\"\"Test with a valid hex string in query parameter.\"\"\"\n url = \"https://www.example.com?q=4a4b4c\"\n self.assertEqual(f_870(url), \"JKL\")\n def test_no_query_parameter(self):\n \"\"\"Test with no query parameter.\"\"\"\n url = \"https://www.example.com\"\n self.assertIsNone(f_870(url))\n def test_invalid_hex_string(self):\n \"\"\"Test with an invalid hex string in query parameter.\"\"\"\n url = \"https://www.example.com?q=4a4b4c4d4\"\n self.assertIsNone(\n f_870(url)\n ) # Updated to assertIsNone as the function now handles the exception\n def test_valid_hex_non_utf8(self):\n \"\"\"Test with a valid hex string that is not valid UTF-8.\"\"\"\n url = \"https://www.example.com?q=80\"\n self.assertIsNone(\n f_870(url)\n ) # Updated to assertIsNone due to the handling of UnicodeDecodeError\n def test_multiple_query_parameters(self):\n \"\"\"Test with multiple query parameters.\"\"\"\n url = \"https://www.example.com?a=123&q=4a4b4c&b=456\"\n self.assertEqual(f_870(url), \"JKL\")", "apis": ["binascii.Error", "binascii.unhexlify", "urllib.parse.urlparse", "urllib.parse", "urllib.parse.parse_qs"], "libs": ["urllib", "binascii"], "doc": {"description": ["Decode a hexadecimal string from the 'q' query parameter of a URL.", "This function extracts the 'q' query parameter from the given URL,", "assumes it is a hexadecimal string, and decodes it into a UTF-8 string.", "If the hexadecimal string is invalid or cannot be decoded into a valid UTF-8 string, None is returned."], "note": [], "params": ["url (str): The URL to extract the query parameter from."], "returns": ["str or None: The decoded string if the 'q' parameter exists and is a valid hexadecimal, otherwise None."], "reqs": ["binascii", "urllib.parse"], "raises": [], "example": [">>> f_870('https://www.example.com?q=4a4b4c')", "'JKL'"]}} +{"task_id": "f_827", "prompt": "import pandas as pd\nimport 
matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\n\n\ndef f_827(df, x_column, y_column):\n \"\"\"\n Draws a scatter plot for the specified columns from a pandas DataFrame and fits a linear regression model to the data.\n\n Parameters:\n df (DataFrame): The input pandas DataFrame.\n x_column (str): The column name for the x-axis. Data contained in column must be numeric.\n y_column (str): The column name for the y-axis. Data contained in column must be numeric.\n\n Returns:\n matplotlib.axes._subplots.Axes: The Axes object containing the scatter plot and the linear regression line.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib\n - sklearn\n\n Notes:\n - After plotting the scatterplot, this function overlays the predicted regression line on top in red on the same Axes.\n\n Example:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})\n >>> ax = f_827(df, 'A', 'B')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " X = df[x_column].values.reshape(-1, 1)\n Y = df[y_column].values\n reg = LinearRegression().fit(X, Y)\n Y_pred = reg.predict(X)\n\n fig, ax = plt.subplots()\n ax.scatter(X, Y)\n ax.plot(X, Y_pred, color=\"red\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n def helper_assert_line_correctness(self, ax, expected_slope, expected_intercept):\n # Helper function to check if linear regression predictions are correct\n tolerance = 1e-6\n # Extract line data\n line = ax.lines[0]\n x_data, y_data = line.get_xdata(), line.get_ydata()\n # Calculate slope and intercept of the line plot\n calculated_slope = (y_data[-1] - y_data[0]) / (x_data[-1] - x_data[0])\n calculated_intercept = y_data[0] - calculated_slope * x_data[0]\n # Assert slope and intercept\n self.assertAlmostEqual(\n calculated_slope,\n expected_slope,\n delta=tolerance,\n msg=\"Slope did not match expected value\",\n )\n self.assertAlmostEqual(\n 
calculated_intercept,\n expected_intercept,\n delta=tolerance,\n msg=\"Intercept did not match expected value\",\n )\n def test_plot_attributes(self):\n # Basic case to test plot is correct\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [1, 2, 3, 4]})\n ax = f_827(df, \"X\", \"Y\")\n self.assertIsInstance(ax, Axes)\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(len(ax.collections), 1)\n def test_linear_positive_slope(self):\n # Testing with a dataset that should produce a positive slope\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [2, 4, 6, 8]})\n ax = f_827(df, \"X\", \"Y\")\n self.helper_assert_line_correctness(ax, expected_slope=2, expected_intercept=0)\n def test_linear_negative_slope(self):\n # Testing with a dataset that should produce a negative slope\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [8, 6, 4, 2]})\n ax = f_827(df, \"X\", \"Y\")\n self.helper_assert_line_correctness(\n ax, expected_slope=-2, expected_intercept=10\n )\n def test_linear_zero_slope(self):\n # Testing with a dataset that should produce a zero slope\n df = pd.DataFrame({\"X\": [1, 2, 3, 4], \"Y\": [5, 5, 5, 5]})\n ax = f_827(df, \"X\", \"Y\")\n self.helper_assert_line_correctness(ax, expected_slope=0, expected_intercept=5)\n def test_single_data_point(self):\n # Testing with a DataFrame having a single data point\n df = pd.DataFrame({\"X\": [1], \"Y\": [1]})\n ax = f_827(df, \"X\", \"Y\")\n self.assertIsInstance(ax, Axes)\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(len(ax.collections), 1)\n def test_missing_values(self):\n # Testing with missing values in the DataFrame\n df = pd.DataFrame({\"X\": [1, 2, np.nan, 4], \"Y\": [1, np.nan, 3, 4]})\n with self.assertRaises(ValueError):\n f_827(df, \"X\", \"Y\")\n def test_with_categorical_data(self):\n # Testing with categorical data to ensure it fails\n df = pd.DataFrame({\"X\": [\"a\", \"b\", \"c\"], \"Y\": [\"d\", \"e\", \"f\"]})\n with self.assertRaises(ValueError):\n f_827(df, \"X\", \"Y\")\n def 
test_incorrect_column_names(self):\n # Testing with incorrect column names\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n with self.assertRaises(KeyError):\n f_827(df, \"X\", \"Y\")", "apis": ["matplotlib.pyplot.subplots", "sklearn.linear_model.LinearRegression"], "libs": ["sklearn", "matplotlib"], "doc": {"description": ["Draws a scatter plot for the specified columns from a pandas DataFrame and fits a linear regression model to the data.", "Notes:", "- After plotting the scatterplot, this function overlays the predicted regression line on top in red on the same Axes."], "note": [], "params": ["df (DataFrame): The input pandas DataFrame.", "x_column (str): The column name for the x-axis. Data contained in column must be numeric.", "y_column (str): The column name for the y-axis. Data contained in column must be numeric."], "returns": ["matplotlib.axes._subplots.Axes: The Axes object containing the scatter plot and the linear regression line."], "reqs": ["pandas", "numpy", "matplotlib", "sklearn"], "raises": [], "example": [">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})", ">>> ax = f_827(df, 'A', 'B')", ">>> type(ax)", ""]}} +{"task_id": "f_906", "prompt": "import pandas as pd\nfrom matplotlib import pyplot as plt\n\n\ndef f_906(arr):\n \"\"\"\n Calculate the sum of each row in a 2D numpy array and plot these sums as a time series.\n\n This function takes a 2D numpy array and computes the sum of elements in each row. It\n then creates a Pandas DataFrame with these row sums and plots them as a time series,\n using dates starting from January 1, 2020, for each row.\n\n Parameters:\n arr (numpy.ndarray): A 2D numpy array.\n\n Returns:\n matplotlib.axes._subplots.Axes: A plot representing the time series of row sums.\n\n Requirements:\n - pandas\n - matplotlib\n\n Handling Scenarios:\n - For non-empty arrays: The function computes the sum of elements for each row, \n stores these sums in a Pandas DataFrame, and then plots them. 
Each row in the plot represents \n the sum for a specific day, starting from January 1, 2020.\n - For empty arrays: The function creates an empty plot with the \n title 'Time Series of Row Sums' but without data. This is achieved by checking if the array size \n is zero (empty array) and if so, creating a subplot without any data.\n \n Note: \n - The function uses 'pandas' for DataFrame creation and 'matplotlib.pyplot' for plotting. \n The dates in the plot start from January 1, 2020, and each subsequent row represents the next day.\n \n Example:\n >>> arr = np.array([[i + j for i in range(3)] for j in range(5)])\n >>> ax = f_906(arr)\n >>> ax.get_title()\n 'Time Series of Row Sums'\n \"\"\"", "canonical_solution": " if not arr.size: # Check for empty array\n _, ax = plt.subplots()\n ax.set_title(\"Time Series of Row Sums\")\n return ax\n\n row_sums = arr.sum(axis=1)\n df = pd.DataFrame(row_sums, columns=[\"Sum\"])\n df.index = pd.date_range(start=\"1/1/2020\", periods=df.shape[0])\n ax = df.plot(title=\"Time Series of Row Sums\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_906.\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test the basic functionality of the function.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n ax = f_906(arr)\n # Check if the function returns Axes object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of rows\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sums = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sums)\n def test_empty_array(self):\n \"\"\"Test the function with an empty array.\"\"\"\n arr = np.array([])\n ax = f_906(arr)\n # Check if the function returns Axes object\n self.assertIsInstance(ax, 
plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted is empty\n lines = ax.get_lines()\n self.assertEqual(len(lines), 0)\n def test_single_row_array(self):\n \"\"\"Test the function with a single row array.\"\"\"\n arr = np.array([[1, 2, 3]])\n ax = f_906(arr)\n # Check if the function returns Axes object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of the single row\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sum = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sum)\n def test_negative_values(self):\n \"\"\"Test the function with negative values.\"\"\"\n arr = np.array([[-1, -2, -3], [-4, -5, -6]])\n ax = f_906(arr)\n # Check if the function returns Axes object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of rows\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sums = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sums)\n def test_zero_values(self):\n \"\"\"Test the function with zero values.\"\"\"\n arr = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])\n ax = f_906(arr)\n # Check if the function returns Axes object\n self.assertIsInstance(ax, plt.Axes)\n # Check the title of the plot\n self.assertEqual(ax.get_title(), \"Time Series of Row Sums\")\n # Check if the data plotted matches the expected sum of rows\n y_data = [line.get_ydata() for line in ax.get_lines()][0]\n expected_sums = arr.sum(axis=1)\n np.testing.assert_array_equal(y_data, expected_sums)\n def tearDown(self):\n plt.close()", "apis": ["matplotlib.pyplot.subplots", "pandas.date_range", "pandas.DataFrame"], "libs": ["pandas", "matplotlib"], "doc": 
{"description": ["Calculate the sum of each row in a 2D numpy array and plot these sums as a time series.", "This function takes a 2D numpy array and computes the sum of elements in each row. It", "then creates a Pandas DataFrame with these row sums and plots them as a time series,", "using dates starting from January 1, 2020, for each row.", "Handling Scenarios:", "- For non-empty arrays: The function computes the sum of elements for each row,", "stores these sums in a Pandas DataFrame, and then plots them. Each row in the plot represents", "the sum for a specific day, starting from January 1, 2020.", "- For empty arrays: The function creates an empty plot with the", "title 'Time Series of Row Sums' but without data. This is achieved by checking if the array size", "is zero (empty array) and if so, creating a subplot without any data."], "note": ["The function uses 'pandas' for DataFrame creation and 'matplotlib.pyplot' for plotting.", "The dates in the plot start from January 1, 2020, and each subsequent row represents the next day."], "params": ["arr (numpy.ndarray): A 2D numpy array."], "returns": ["matplotlib.axes._subplots.Axes: A plot representing the time series of row sums."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> arr = np.array([[i + j for i in range(3)] for j in range(5)])", ">>> ax = f_906(arr)", ">>> ax.get_title()", "'Time Series of Row Sums'"]}} {"task_id": "f_543", "prompt": "from collections import Counter\nimport math\n\ndef f_543(nested_dict):\n \"\"\"\n Aggregate the values of the same keys from a nested dictionary and remove the \"ele\" key. For each remaining key take the sine.\n \n Parameters:\n - nested_dict (dict): The nested dictionary. Default is NESTED_DICT constant.\n \n Returns:\n - dict: A dictionary with aggregated values.\n\n Requirements:\n - math\n - collections\n\n Example:\n >>> f_543({\n ... 'dict1': {'ale': 1, 'ele': 2, 'ile': 3},\n ... 'dict2': {'ele': 4, 'ole': 5, 'ule': 6},\n ... 
'dict3': {'ile': 7, 'ale': 8, 'ele': 9}\n ... })\n {'ale': 0.4121184852417566, 'ile': -0.5440211108893698, 'ole': -0.9589242746631385, 'ule': -0.27941549819892586}\n \"\"\"", "canonical_solution": " counter = Counter()\n for sub_dict in nested_dict.values():\n counter.update(sub_dict)\n\n counter.pop('ele', None)\n\n return {k: math.sin(v) for k,v in counter.items()}", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_543({\n 'dict1': {'ale': 1, 'ele': 2, 'ile': 3},\n 'dict2': {'ele': 4, 'ole': 5, 'ule': 6},\n 'dict3': {'ile': 7, 'ale': 8, 'ele': 9}\n }), {'ale': math.sin(9), 'ile': math.sin(10), 'ole': math.sin(5), 'ule': math.sin(6)})\n def test_case_2(self):\n self.assertEqual(f_543({\n 'aaa': {'zzz': 1, 'yyy': 2, 'xxx': 3},\n 'bbb': {'yyy': 4, 'xxx': 5, 'www': 6},\n 'ccc': {'xxx': 7, 'www': 8, 'ele': 9},\n 'ddd': {'www': 10, 'ele': 11, 'zzz': 12}\n }), {'zzz': math.sin(13), 'yyy': math.sin(6), 'xxx': math.sin(15), 'www': math.sin(24)})\n def test_case_3(self):\n self.assertEqual(f_543({\n 'x': {'a': 1, 'b': 2, 'c': 3},\n 'y': {'b': 4, 'c': 5, 'd': 6},\n 'z': {'c': 7, 'd': 8, 'e': 9}\n }), {'a': math.sin(1), 'b': math.sin(6), 'c': math.sin(15), 'd': math.sin(14), 'e': math.sin(9)})\n def test_case_4(self):\n self.assertEqual(f_543({\n 'x': {'a': 1, 'b': 2, 'c': 3},\n 'y': {'b': 4, 'c': 5, 'd': 6},\n 'z': {'c': 7, 'd': 8, 'ele': 9}\n }), {'a': math.sin(1), 'b': math.sin(6), 'c': math.sin(15), 'd': math.sin(14)})\n def test_case_5(self):\n self.assertEqual(f_543({\n 1: {1: 1, 2: 2, 3: 3},\n 2: {2: 4, 3: 5, 4: 6},\n 3: {3: 7, 4: 8, 5: 9}\n }), {1: math.sin(1), 2: math.sin(6), 3: math.sin(15), 4: math.sin(14), 5: math.sin(9)})", "apis": ["math.sin", "collections.Counter"], "libs": ["collections", "math"], "doc": {"description": ["Aggregate the values of the same keys from a nested dictionary and remove the \"ele\" key. 
For each remaining key take the sine."], "note": [], "params": ["nested_dict (dict): The nested dictionary. Default is NESTED_DICT constant."], "returns": ["dict: A dictionary with aggregated values."], "reqs": ["math", "collections"], "raises": [], "example": [">>> f_543({", "... 'dict1': {'ale': 1, 'ele': 2, 'ile': 3},", "... 'dict2': {'ele': 4, 'ole': 5, 'ule': 6},", "... 'dict3': {'ile': 7, 'ale': 8, 'ele': 9}", "... })", "{'ale': 0.4121184852417566, 'ile': -0.5440211108893698, 'ole': -0.9589242746631385, 'ule': -0.27941549819892586}"]}} -{"task_id": "f_923", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_923(data):\n \"\"\"\n Processes a dictionary containing product names and their corresponding prices in string format. \n The function converts these string prices (which may include commas as thousand separators) into float values. \n It then calculates statistical measures (mean, median, and standard deviation) of these prices and \n generates a histogram to visually represent the distribution of the prices.\n\n Parameters:\n - data (dict): A dictionary with two keys: 'Product' and 'Price_String'. \n 'Product' is a list of product names, each name corresponding to a product.\n 'Price_String' is a list of prices in string format, associated with these products. \n The price strings can contain commas for thousand separators and a period for the decimal point (e.g., \"1,234.56\").\n\n Returns:\n - dict: Contains the calculated mean, median, and standard deviation (sample) of the prices. \n The keys are 'mean', 'median', and 'std_dev'.\n - matplotlib.axes._subplots.AxesSubplot: A subplot object that represents the histogram plot of the product prices. \n The histogram displays the frequency distribution of the prices.\n\n Note:\n - A histogram plot is generated using these prices, with automatic bin sizing ('auto'), a blue color, \n 70% opacity (alpha=0.7), and a relative width (rwidth) of 0.85 for the bars. 
\n - The histogram's title is set to 'Histogram of Product Prices', and the x and y-axis are labeled 'Price' and 'Frequency', respectively.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib\n\n Example:\n >>> results = f_923({'Product': ['Apple', 'Banana'], 'Price_String': ['1,234.00', '567.89']})\n >>> print(results)\n ({'mean': 900.9449999999999, 'median': 900.9449999999999, 'std_dev': 471.0108980161712}, (array([1., 1.]), array([ 567.89 , 900.945, 1234. ]), ))\n\n Note:\n - The function assumes that each product name in the 'Product' list has a corresponding price in the 'Price_String' list.\n - The histogram plot's appearance (like color, alpha, and rwidth) is pre-set but can be customized further if needed.\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n # Correctly convert string prices to float, accounting for commas\n df[\"Price_Float\"] = df[\"Price_String\"].apply(lambda x: float(x.replace(\",\", \"\")))\n\n mean_price = np.mean(df[\"Price_Float\"])\n median_price = np.median(df[\"Price_Float\"])\n # Use ddof=1 for sample standard deviation\n std_dev_price = np.std(df[\"Price_Float\"], ddof=1)\n\n # Histogram plot settings can be refined for better visualization\n ax = plt.hist(df[\"Price_Float\"], bins=\"auto\", color=\"blue\", alpha=0.7, rwidth=0.85)\n plt.title(\"Histogram of Product Prices\")\n plt.xlabel(\"Price\")\n plt.ylabel(\"Frequency\")\n\n return {\"mean\": mean_price, \"median\": median_price, \"std_dev\": std_dev_price}, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_923\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test basic functionality.\"\"\"\n sample_data = {\n \"Product\": [\"James\", \"Olivia\", \"Jamie\", \"Angela\", \"Jennifer\"],\n \"Price_String\": [\"2,213.00\", \"6,083.00\", \"5,461.00\", \"884.00\", \"2,783.00\"],\n }\n float_prices = [\n float(price.replace(\",\", \"\")) for price in sample_data[\"Price_String\"]\n ]\n expected_mean = 
np.mean(float_prices)\n expected_median = np.median(float_prices)\n expected_std_dev = np.std(float_prices, ddof=1)\n result, _ = f_923(sample_data)\n self.assertAlmostEqual(result[\"mean\"], expected_mean)\n self.assertAlmostEqual(result[\"median\"], expected_median)\n self.assertAlmostEqual(result[\"std_dev\"], expected_std_dev)\n def test_large_sample_size(self):\n \"\"\"Test large sample size.\"\"\"\n sample_data = {\n \"Product\": [\n \"Adam\",\n \"Lisa\",\n \"Scott\",\n \"Bianca\",\n \"Ashlee\",\n \"Shannon\",\n \"Michelle\",\n \"Robert\",\n \"Joseph\",\n \"Joshua\",\n \"Traci\",\n \"Jacob\",\n \"Daniel\",\n \"Timothy\",\n \"Paul\",\n ],\n \"Price_String\": [\n \"1,691.00\",\n \"967.00\",\n \"5,789.00\",\n \"6,806.00\",\n \"3,301.00\",\n \"5,319.00\",\n \"7,619.00\",\n \"134.00\",\n \"7,883.00\",\n \"5,028.00\",\n \"3,330.00\",\n \"5,253.00\",\n \"8,551.00\",\n \"1,631.00\",\n \"7,637.00\",\n ],\n }\n float_prices = [\n float(price.replace(\",\", \"\")) for price in sample_data[\"Price_String\"]\n ]\n expected_mean = np.mean(float_prices)\n expected_median = np.median(float_prices)\n expected_std_dev = np.std(float_prices, ddof=1)\n result, _ = f_923(sample_data)\n self.assertAlmostEqual(result[\"mean\"], expected_mean)\n self.assertAlmostEqual(result[\"median\"], expected_median)\n self.assertAlmostEqual(result[\"std_dev\"], expected_std_dev)\n def test_invalid_input(self):\n \"\"\"Test invalid input.\"\"\"\n with self.assertRaises(Exception):\n f_923({})\n with self.assertRaises(Exception):\n f_923({\"Product\": [\"Apple\"], \"Price_WrongKey\": [\"1,234.00\"]})\n def test_all_zero_prices(self):\n \"\"\"Test all zero prices.\"\"\"\n sample_data = {\n \"Product\": [\"Apple\", \"Banana\", \"Cherry\"],\n \"Price_String\": [\"0.00\", \"0.00\", \"0.00\"],\n }\n result, _ = f_923(sample_data)\n self.assertEqual(result[\"mean\"], 0)\n self.assertEqual(result[\"median\"], 0)\n self.assertEqual(result[\"std_dev\"], 0)\n def test_non_uniform_distribution(self):\n 
\"\"\"Test non-uniform distribution.\"\"\"\n sample_data = {\n \"Product\": [\"Apple\", \"Banana\", \"Cherry\", \"Date\", \"Fig\"],\n \"Price_String\": [\"1,000.00\", \"500.00\", \"1,500.00\", \"2,000.00\", \"2,500.00\"],\n }\n float_prices = [\n float(price.replace(\",\", \"\")) for price in sample_data[\"Price_String\"]\n ]\n expected_mean = np.mean(float_prices)\n expected_median = np.median(float_prices)\n expected_std_dev = np.std(float_prices, ddof=1)\n result, _ = f_923(sample_data)\n self.assertAlmostEqual(result[\"mean\"], expected_mean)\n self.assertAlmostEqual(result[\"median\"], expected_median)\n self.assertAlmostEqual(result[\"std_dev\"], expected_std_dev)\n def tearDown(self):\n plt.close()", "apis": ["numpy.median", "numpy.std", "numpy.mean", "matplotlib.pyplot.xlabel", "matplotlib.pyplot.hist", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "pandas.DataFrame"], "libs": ["numpy", "matplotlib", "pandas"], "doc": {"description": ["Processes a dictionary containing product names and their corresponding prices in string format.", "The function converts these string prices (which may include commas as thousand separators) into float values.", "It then calculates statistical measures (mean, median, and standard deviation) of these prices and", "generates a histogram to visually represent the distribution of the prices."], "note": ["A histogram plot is generated using these prices, with automatic bin sizing ('auto'), a blue color,", "70% opacity (alpha=0.7), and a relative width (rwidth) of 0.85 for the bars.", "The histogram's title is set to 'Histogram of Product Prices', and the x and y-axis are labeled 'Price' and 'Frequency', respectively.", "The function assumes that each product name in the 'Product' list has a corresponding price in the 'Price_String' list.", "The histogram plot's appearance (like color, alpha, and rwidth) is pre-set but can be customized further if needed."], "params": ["data (dict): A dictionary with two keys: 'Product' 
and 'Price_String'.", "'Product' is a list of product names, each name corresponding to a product.", "'Price_String' is a list of prices in string format, associated with these products.", "The price strings can contain commas for thousand separators and a period for the decimal point (e.g., \"1,234.56\")."], "returns": ["dict: Contains the calculated mean, median, and standard deviation (sample) of the prices.", "The keys are 'mean', 'median', and 'std_dev'.", "matplotlib.axes._subplots.AxesSubplot: A subplot object that represents the histogram plot of the product prices.", "The histogram displays the frequency distribution of the prices."], "reqs": ["pandas", "numpy", "matplotlib"], "raises": [], "example": [">>> results = f_923({'Product': ['Apple', 'Banana'], 'Price_String': ['1,234.00', '567.89']})", ">>> print(results)", "({'mean': 900.9449999999999, 'median': 900.9449999999999, 'std_dev': 471.0108980161712}, (array([1., 1.]), array([ 567.89 , 900.945, 1234. ]), ))"]}} -{"task_id": "f_328", "prompt": "import sqlite3\nimport pandas as pd\n\n\ndef f_328(db_file: str, query: str) -> pd.DataFrame:\n \"\"\"Query an SQLite database and return the results.\n\n This function connects to a given SQLite database, executes a given SQL query,\n and returns the results as a pandas DataFrame.\n\n Parameters:\n - db_file (str): Path to the SQLite database file.\n - query (str): SQL query to execute.\n\n Returns:\n - pd.DataFrame: A DataFrame containing the results of the executed query.\n\n Requirements:\n - sqlite3\n - pandas\n\n Example:\n >>> db_file = 'sample_database.db'\n >>> df = f_328(db_file, \"SELECT * FROM users WHERE name = 'John Doe'\")\n pd.DataFrame:\n id name age\n -- ---------- ---\n .. 
John Doe ..\n >>> df = f_328(db_file, \"SELECT age, COUNT(*) AS count FROM users GROUP BY age\")\n pd.DataFrame:\n age count\n --- -----\n 25 3\n \"\"\"", "canonical_solution": " with sqlite3.connect(db_file) as conn:\n return pd.read_sql_query(query, conn)", "test": "import unittest\nimport sqlite3\nfrom faker import Faker\nimport os\nclass TestCases(unittest.TestCase):\n fake = Faker()\n specific_names = [\n \"John Doe\",\n \"Jane Smith\",\n \"Alice Brown\",\n \"Bob White\",\n \"Charlie Green\",\n ]\n specific_ages = [25, 30, 35, 40, 45]\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up test data before running tests.\"\"\"\n cls.db_file = cls.generate_test_data_with_file()\n @staticmethod\n def generate_test_data_with_file() -> str:\n \"\"\"Generate test data and save it to a temporary SQLite database file.\"\"\"\n db_file = \"./temp_test_db.sqlite3\"\n if os.path.exists(db_file):\n os.remove(db_file)\n conn = sqlite3.connect(db_file)\n create_table_query = \"\"\"\n CREATE TABLE users (\n id INTEGER PRIMARY KEY,\n name TEXT NOT NULL,\n age INTEGER NOT NULL\n )\n \"\"\"\n conn.execute(create_table_query)\n for _ in range(100):\n name = TestCases.fake.name()\n age = TestCases.fake.random_int(min=20, max=70)\n conn.execute(\"INSERT INTO users (name, age) VALUES (?, ?)\", (name, age))\n for name, age in zip(TestCases.specific_names, TestCases.specific_ages):\n conn.execute(\"INSERT INTO users (name, age) VALUES (?, ?)\", (name, age))\n conn.commit()\n conn.close()\n return db_file\n def test_case_1(self):\n \"\"\"Test fetching all users.\"\"\"\n df = f_328(self.db_file, \"SELECT * FROM users\")\n self.assertEqual(len(df), 100 + len(self.specific_names))\n for name in self.specific_names:\n self.assertIn(name, df[\"name\"].values)\n def test_case_2(self):\n \"\"\"Test fetching specific users based on names.\"\"\"\n names_as_strings = \"', '\".join(self.specific_names)\n df = f_328(\n self.db_file,\n f\"SELECT name, age FROM users WHERE name IN 
('{names_as_strings}')\",\n )\n for name in self.specific_names:\n self.assertIn(name, df[\"name\"].values)\n for age in self.specific_ages:\n self.assertIn(age, df[\"age\"].values)\n def test_case_3(self):\n \"\"\"Test fetching users based on age condition.\"\"\"\n age_limit = self.fake.random_int(min=20, max=60)\n df = f_328(self.db_file, f\"SELECT * FROM users WHERE age > {age_limit}\")\n self.assertTrue(all(df[\"age\"] > age_limit))\n def test_case_4(self):\n \"\"\"Test fetching users and sorting by name.\"\"\"\n df = f_328(self.db_file, \"SELECT * FROM users ORDER BY name\")\n sorted_names = sorted(df[\"name\"].tolist())\n self.assertListEqual(df[\"name\"].tolist(), sorted_names)\n def test_case_5(self):\n \"\"\"Test fetching users based on age and sorting by age.\"\"\"\n age_limit = self.fake.random_int(min=20, max=30)\n df = f_328(\n self.db_file,\n f\"SELECT * FROM users WHERE age < {age_limit} ORDER BY age DESC\",\n )\n self.assertTrue(all(df[\"age\"] < age_limit))\n self.assertTrue(\n all(df[\"age\"].iloc[i] >= df[\"age\"].iloc[i + 1] for i in range(len(df) - 1))\n )\n @classmethod\n def tearDownClass(cls):\n \"\"\"Clean up test data after running tests.\"\"\"\n os.remove(cls.db_file)", "apis": ["pandas.DataFrame", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["sqlite3", "pandas"], "doc": {"description": ["Query an SQLite database and return the results.", "This function connects to a given SQLite database, executes a given SQL query,", "and returns the results as a pandas DataFrame."], "note": [], "params": ["db_file (str): Path to the SQLite database file.", "query (str): SQL query to execute."], "returns": ["pd.DataFrame: A DataFrame containing the results of the executed query."], "reqs": ["sqlite3", "pandas"], "raises": [], "example": [">>> db_file = 'sample_database.db'", ">>> df = f_328(db_file, \"SELECT * FROM users WHERE name = 'John Doe'\")", "pd.DataFrame:", "id name age", "-- ---------- ---", ".. 
John Doe ..", ">>> df = f_328(db_file, \"SELECT age, COUNT(*) AS count FROM users GROUP BY age\")", "pd.DataFrame:", "age count", "--- -----", "25 3"]}} -{"task_id": "f_741", "prompt": "import numpy as np\nimport random\n\ndef f_741(length=10000, seed=0):\n \"\"\"\n Generates a random walk of a specified length. A random walk is a path that consists of a series of random steps\n on some mathematical space. In this case, the steps are either +1 or -1, chosen with equal probability.\n\n Parameters:\n - length (int): The number of steps in the random walk. Must be a non-negative integer. Default is 10000.\n - seed (int, optional): An optional seed value to initialize the random number generator. Use this for reproducible results.\n \n Requirements:\n - numpy\n - random\n \n Returns:\n - np.array: A numpy array representing the positions of the walk at each step. Starts at 0.\n\n Raises:\n - ValueError: If `length` is negative.\n \n Example:\n >>> random.seed(0) # For reproducibility in doctest\n >>> walk = f_741(5)\n >>> walk.tolist()\n [0, 1, 2, 1, 0, 1]\n \"\"\"", "canonical_solution": " if length < 0:\n raise ValueError(\"length must be a non-negative integer\")\n random.seed(seed)\n steps = [1 if random.random() > 0.5 else -1 for _ in range(length)]\n walk = np.cumsum([0] + steps) # Starts at 0\n return walk", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n random.seed(42) # Setting seed for reproducibility\n def test_default_length(self):\n walk = f_741(seed=42)\n self.assertEqual(len(walk), 10001) # Includes starting point\n def test_custom_length(self):\n walk = f_741(5000, seed=42)\n self.assertEqual(len(walk), 5001) # Includes starting point\n def test_first_step_zero(self):\n walk = f_741(1, seed=42)\n self.assertEqual(walk[0], 0) # First position should be 0\n def test_negative_length(self):\n with self.assertRaises(ValueError):\n f_741(-1)\n def test_output_type(self):\n walk = f_741(5, seed=42)\n 
self.assertEqual(walk.tolist(), [0, 1, 0, -1, -2, -1])", "apis": ["numpy.cumsum", "random.seed", "random.random"], "libs": ["numpy", "random"], "doc": {"description": ["Generates a random walk of a specified length. A random walk is a path that consists of a series of random steps", "on some mathematical space. In this case, the steps are either +1 or -1, chosen with equal probability."], "note": [], "params": ["length (int): The number of steps in the random walk. Must be a non-negative integer. Default is 10000.", "seed (int, optional): An optional seed value to initialize the random number generator. Use this for reproducible results."], "returns": ["np.array: A numpy array representing the positions of the walk at each step. Starts at 0."], "reqs": ["numpy", "random"], "raises": ["ValueError: If `length` is negative."], "example": [">>> random.seed(0) # For reproducibility in doctest", ">>> walk = f_741(5)", ">>> walk.tolist()", "[0, 1, 2, 1, 0, 1]"]}} -{"task_id": "f_879", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_879(s1, s2):\n \"\"\"\n Visualize two Series using a swarm plot with a highlight on their intersecting data points.\n\n This function creates a swarm plot to visually compare two pandas Series. \n It highlights the intersection points between these two series by drawing red dashed lines at the intersecting data points.\n\n Parameters:\n - s1 (pd.Series): The first series of data. This series must have a unique name that identifies it in the plot.\n - s2 (pd.Series): The second series of data. Similar to s1, this series must also have a unique name.\n\n Returns:\n - ax (matplotlib.Axes): The Axes object of the plotted swarm chart. This object can be used for further customization of the plot if required.\n intersection_count (int): The number of unique intersecting data points between s1 and s2. 
\n This count gives a quick numerical summary of the overlap between the two series.\n\n Requirements:\n - pandas\n - seaborn\n - matplotlib\n\n Example:\n >>> s1 = pd.Series([1, 2, 3, 4, 5], name='Series1')\n >>> s2 = pd.Series([4, 5, 6, 7, 8], name='Series2')\n >>> ax, count = f_879(s1, s2)\n >>> ax.get_title()\n 'Overlap Between Series1 and Series2'\n \"\"\"", "canonical_solution": " # Find the intersection data points\n intersection = set(s1).intersection(set(s2))\n\n # Prepare data for visualization\n df1 = pd.DataFrame({s1.name: s1, \"Type\": \"Series1\"})\n df2 = pd.DataFrame({s2.name: s2, \"Type\": \"Series2\"})\n df = pd.concat([df1, df2], axis=0, ignore_index=True)\n\n # Create a swarm plot\n _, ax = plt.subplots(figsize=(10, 6))\n sns.swarmplot(x=df.columns[0], y=\"Type\", data=df, ax=ax)\n\n # Highlight intersection points\n for point in intersection:\n ax.axvline(x=point, color=\"red\", linestyle=\"--\")\n\n ax.set_title(f\"Overlap Between {s1.name} and {s2.name}\")\n\n return ax, len(intersection)", "test": "import pandas as pd\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_879.\"\"\"\n def test_intersection_exists(self):\n \"\"\"Test that the function works when the two series have an intersection.\"\"\"\n s1 = pd.Series([1, 2, 3, 4, 5], name=\"Series1\")\n s2 = pd.Series([4, 5, 6, 7, 8], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 2)\n def test_no_intersection(self):\n \"\"\"Test that the function works when the two series have no intersection.\"\"\"\n s1 = pd.Series([1, 2, 3], name=\"Series1\")\n s2 = pd.Series([4, 5, 6], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 0)\n def test_empty_series(self):\n \"\"\"Test that the function works when one of the 
series is empty.\"\"\"\n s1 = pd.Series([], name=\"Series1\")\n s2 = pd.Series([], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 0)\n def test_partial_intersection(self):\n \"\"\"Test that the function works when the two series have a partial intersection.\"\"\"\n s1 = pd.Series([1, 2], name=\"Series1\")\n s2 = pd.Series([2, 3], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 1)\n def test_identical_series(self):\n \"\"\"Test that the function works when the two series are identical.\"\"\"\n s1 = pd.Series([1, 2, 3], name=\"Series1\")\n s2 = pd.Series([1, 2, 3], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 3)\n def tearDown(self):\n plt.clf()", "apis": ["pandas.DataFrame", "seaborn.swarmplot", "matplotlib.pyplot.subplots", "pandas.concat"], "libs": ["matplotlib", "pandas", "seaborn"], "doc": {"description": ["Visualize two Series using a swarm plot with a highlight on their intersecting data points.", "This function creates a swarm plot to visually compare two pandas Series.", "It highlights the intersection points between these two series by drawing red dashed lines at the intersecting data points."], "note": [], "params": ["s1 (pd.Series): The first series of data. This series must have a unique name that identifies it in the plot.", "s2 (pd.Series): The second series of data. Similar to s1, this series must also have a unique name."], "returns": ["ax (matplotlib.Axes): The Axes object of the plotted swarm chart. 
This object can be used for further customization of the plot if required.", "intersection_count (int): The number of unique intersecting data points between s1 and s2.", "This count gives a quick numerical summary of the overlap between the two series."], "reqs": ["pandas", "seaborn", "matplotlib"], "raises": [], "example": [">>> s1 = pd.Series([1, 2, 3, 4, 5], name='Series1')", ">>> s2 = pd.Series([4, 5, 6, 7, 8], name='Series2')", ">>> ax, count = f_879(s1, s2)", ">>> ax.get_title()", "'Overlap Between Series1 and Series2'"]}} -{"task_id": "f_795", "prompt": "import pandas as pd\nimport random\nfrom datetime import datetime\n\n\ndef f_795(\n task_list,\n n_tasks,\n employees=[\"John Doe\", \"Jane Smith\", \"James Brown\", \"Mary Johnson\", \"Robert Davis\"],\n seed=None,\n):\n \"\"\"\n Randomly assigns a specified number of tasks to employees with a due date of the current day\n and returns a DataFrame with these assignments.\n\n Parameters:\n - task_list (list of str): List of tasks to be assigned.\n - n_tasks (int): Number of tasks to be assigned. This number should not be negative.\n - employees (list of str, optional): List of employee names to whom tasks can be assigned.\n If not provided, defaults to: ['John Doe', 'Jane Smith',\n 'James Brown', 'Mary Johnson', 'Robert Davis'].\n - seed (int, optional): Seed for the random number generator to ensure reproducibility. 
Defaults to None (not set).\n\n Returns:\n - pd.DataFrame: Contains columns 'Task Name', 'Assigned To', and 'Due Date', with each row representing an assigned task.\n\n Raises:\n - ValueError: If n_tasks is negative.\n\n Note:\n - Task names are sanitized by replacing spaces with underscores.\n - Due dates are set to the current system date.\n\n Requirements:\n - pandas\n - random\n - datetime\n\n Examples:\n >>> df = f_795(['Clean Office', 'Prepare Report', 'Client Meeting'], 2, seed=42)\n >>> df\n Task Name Assigned To Due Date\n 0 Client_Meeting John Doe 2024-04-13\n 1 Clean_Office James Brown 2024-04-13\n >>> type(df)\n \n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n if n_tasks < 0:\n raise ValueError(\"n_tasks cannot be negative.\")\n\n assignment_data = []\n for _ in range(n_tasks):\n if not task_list:\n break\n task_name = random.choice(task_list).replace(\" \", \"_\")\n employee = random.choice(employees)\n due_date = datetime.today().strftime(\"%Y-%m-%d\")\n assignment_data.append([task_name, employee, due_date])\n\n assignment_df = pd.DataFrame(\n assignment_data, columns=[\"Task Name\", \"Assigned To\", \"Due Date\"]\n )\n\n return assignment_df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.default_tasks = [\"Task_1\", \"Task_2\", \"Task_3\"]\n self.default_seed = 123\n self.expected_columns = {\"Task Name\", \"Assigned To\", \"Due Date\"}\n self.today_str = datetime.today().strftime(\"%Y-%m-%d\")\n def test_case_1(self):\n # Test basic functionality\n n_tasks = 2\n df = f_795(self.default_tasks, n_tasks, seed=self.default_seed)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), n_tasks)\n self.assertTrue(all(df[\"Due Date\"] == self.today_str))\n self.assertTrue(all(\"_\" in name for name in df[\"Task Name\"]))\n def test_case_2(self):\n # List of tasks containing special characters 
and spaces\n tasks = [\"Task #1\", \"Task @2\", \"Task 3\"]\n n_tasks = 2\n df = f_795(tasks, n_tasks, seed=self.default_seed)\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), n_tasks)\n def test_case_3(self):\n # Test n_tasks\n for n_tasks in [2, 10, 20, 100]:\n df = f_795(self.default_tasks, n_tasks, seed=self.default_seed)\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), n_tasks)\n def test_case_4(self):\n # Test error handling - negative tasks\n with self.assertRaises(ValueError):\n f_795(self.default_tasks, -1, seed=self.default_seed)\n def test_case_5(self):\n # Test zero task\n df = f_795(self.default_tasks, 0, seed=self.default_seed)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), 0)\n def test_case_6(self):\n # Test empty task list\n df = f_795([], 2, seed=self.default_seed)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(len(df), 0)\n def test_case_7(self):\n # Test custom employee\n custom_employees = [\"Alice\", \"Bob\", \"Charlie\"]\n df = f_795(\n self.default_tasks, 200, employees=custom_employees, seed=self.default_seed\n )\n self.assertTrue(\n all(employee in custom_employees for employee in df[\"Assigned To\"])\n )\n def test_case_8(self):\n # Test random seed\n df1 = f_795(self.default_tasks, 50, seed=0)\n df2 = f_795(self.default_tasks, 50, seed=0)\n df3 = f_795(self.default_tasks, 50, seed=100)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertFalse(df1.equals(df3))\n def test_case_9(self):\n # Test task name with spaces\n tasks = [\"Task One\", \"Task Two\"]\n df = f_795(tasks, 2, seed=42)\n self.assertSetEqual(set(df[\"Task Name\"]), {\"Task_One\", \"Task_Two\"})\n def test_case_10(self):\n # Test task list with duplicates\n tasks = [\"Task\", \"Task\"]\n df = f_795(tasks, 2, 
seed=42)\n self.assertEqual(len(df), len(tasks))\n self.assertEqual(set(df[\"Task Name\"]), {\"Task\"})", "apis": ["pandas.DataFrame", "datetime.datetime.today", "random.seed", "random.choice"], "libs": ["pandas", "datetime", "random"], "doc": {"description": ["Randomly assigns a specified number of tasks to employees with a due date of the current day", "and returns a DataFrame with these assignments."], "note": ["Task names are sanitized by replacing spaces with underscores.", "Due dates are set to the current system date."], "params": ["task_list (list of str): List of tasks to be assigned.", "n_tasks (int): Number of tasks to be assigned. This number should not be negative.", "employees (list of str, optional): List of employee names to whom tasks can be assigned.", "If not provided, defaults to: ['John Doe', 'Jane Smith',", "'James Brown', 'Mary Johnson', 'Robert Davis'].", "seed (int, optional): Seed for the random number generator to ensure reproducibility. Defaults to None (not set)."], "returns": ["pd.DataFrame: Contains columns 'Task Name', 'Assigned To', and 'Due Date', with each row representing an assigned task."], "reqs": ["pandas", "random", "datetime"], "raises": ["ValueError: If n_tasks is negative."], "example": ["Examples:", ">>> df = f_795(['Clean Office', 'Prepare Report', 'Client Meeting'], 2, seed=42)", ">>> df", "Task Name Assigned To Due Date", "0 Client_Meeting John Doe 2024-04-13", "1 Clean_Office James Brown 2024-04-13", ">>> type(df)", ""]}} -{"task_id": "f_897", "prompt": "import pandas as pd\nfrom sklearn.feature_extraction.text import CountVectorizer\nimport matplotlib.pyplot as plt\n\n# Constants\nSTOP_WORDS = [\"a\", \"an\", \"the\", \"in\", \"on\", \"at\", \"and\", \"or\"]\n\n\ndef f_897(file_path, save_path=None):\n \"\"\"\n Processes a CSV file containing text data and generates a histogram of the ten most common words.\n\n This function reads a CSV file, which is expected to contain a single column of text data. 
It then splits the text\n into words and creates a histogram of the frequency of the top ten most common words, excluding a predefined set of\n stopwords. The resulting histogram can be either displayed on the screen or saved to a file.\n\n The CSV file should have a single column with the header 'Text'. Each row under this column should contain a text string.\n If the CSV file does not have a header, the first column is assumed to be the text data.\n\n Parameters:\n - file_path (str): The path to the input CSV file.\n - save_path (str, optional): The path where the histogram plot will be saved. If not provided, the plot is displayed on the screen.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object of the plot if save_path is not provided.\n Useful for further customization or display in notebooks.\n - None: If save_path is provided, the plot is saved to the specified path, \n and the function returns None.\n\n Raises:\n - FileNotFoundError: If the specified file_path does not exist. It raises a \n FileNotFoundError with a message indicating the file path that was not found.\n - Exception: For any other errors that occur during the function execution. 
\n In this case, the error is printed to the console, and None is returned.\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Notes:\n - The function uses pandas for data manipulation, sklearn's CountVectorizer for text vectorization, and matplotlib for plotting.\n - A predefined list of stopwords is used to filter out common but insignificant words from the histogram.\n\n Examples:\n >>> ax = f_897('text_data.csv')\n >>> print(ax)\n Axes(0.125,0.11;0.775x0.77)\n >>> result = f_897('text_data.csv', 'output_plot.png')\n >>> print(result)\n None\n \"\"\"", "canonical_solution": " try:\n # Reading the CSV file into a DataFrame\n df = pd.read_csv(file_path, usecols=[0], names=[\"Text\"], header=None)\n\n # Vectorizing the text\n vectorizer = CountVectorizer(stop_words=STOP_WORDS)\n word_count = vectorizer.fit_transform(df[\"Text\"].dropna())\n\n # Calculating word frequency\n sum_words = word_count.sum(axis=0)\n words_freq = [\n (word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()\n ]\n words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)\n\n # Preparing data for the top 10 words\n top_words = words_freq[:10]\n df_top = pd.DataFrame(top_words, columns=[\"Word\", \"Count\"])\n\n # Plotting\n ax = df_top.plot.bar(x=\"Word\", y=\"Count\", rot=0, legend=False)\n\n # Saving or displaying the plot\n if save_path:\n plt.savefig(save_path)\n plt.close()\n\n return None if save_path else ax\n\n except FileNotFoundError as exc:\n raise FileNotFoundError(f\"File not found: {file_path}\") from exc\n\n except Exception as e:\n print(f\"An error occurred: {e}\")\n return None", "test": "import unittest\nfrom unittest.mock import patch\nimport matplotlib.pyplot as plt\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_897.\"\"\"\n def tearDown(self):\n \"\"\"Clean up by removing files created during tests.\"\"\"\n plt.close()\n if os.path.exists(\"test_output.png\"):\n os.remove(\"test_output.png\")\n 
@patch(\"pandas.read_csv\")\n def test_display_plot(self, mock_read_csv):\n \"\"\"\n Test if the function displays a plot correctly when no save path is provided.\n \"\"\"\n # Mock data\n mock_read_csv.return_value = pd.DataFrame(\n {\"Text\": [\"word1 word2 word3\", \"word2 word3 word4\"]}\n )\n # Test\n result = f_897(\"dummy_path.csv\")\n print(result)\n self.assertIsNotNone(result)\n @patch(\"pandas.read_csv\")\n def test_save_plot(self, mock_read_csv):\n \"\"\"\n Test if the function saves a plot correctly when a save path is provided.\n \"\"\"\n # Mock data\n mock_read_csv.return_value = pd.DataFrame(\n {\"Text\": [\"word1 word2 word3\", \"word2 word3 word4\"]}\n )\n # Test\n result = f_897(\"dummy_path.csv\", \"test_output.png\")\n self.assertIsNone(result)\n self.assertTrue(os.path.exists(\"test_output.png\"))\n @patch(\"pandas.read_csv\")\n def test_empty_file(self, mock_read_csv):\n \"\"\"\n Test the function's behavior with an empty file.\n \"\"\"\n # Mock data\n mock_read_csv.return_value = pd.DataFrame({\"Text\": []})\n # Test\n result = f_897(\"dummy_path.csv\")\n self.assertIsNone(result)\n @patch(\"pandas.read_csv\")\n def test_invalid_file_path(self, mock_read_csv):\n \"\"\"\n Test the function's behavior with an invalid file path.\n \"\"\"\n mock_read_csv.side_effect = FileNotFoundError\n # Test\n with self.assertRaises(FileNotFoundError):\n f_897(\"invalid_path.csv\")\n @patch(\"pandas.read_csv\")\n def test_large_data_set(self, mock_read_csv):\n \"\"\"\n Test the function's behavior with a large data set.\n \"\"\"\n # Mock data: Generate a large dataset\n mock_read_csv.return_value = pd.DataFrame(\n {\"Text\": [\"word\" + str(i) for i in range(1000)]}\n )\n # Test\n result = f_897(\"dummy_path.csv\")\n self.assertIsNotNone(result)", "apis": ["pandas.read_csv", "matplotlib.pyplot.close", "sklearn.feature_extraction.text.CountVectorizer", "matplotlib.pyplot.savefig", "pandas.DataFrame"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": 
{"description": ["Processes a CSV file containing text data and generates a histogram of the ten most common words.", "This function reads a CSV file, which is expected to contain a single column of text data. It then splits the text", "into words and creates a histogram of the frequency of the top ten most common words, excluding a predefined set of", "stopwords. The resulting histogram can be either displayed on the screen or saved to a file.", "The CSV file should have a single column with the header 'Text'. Each row under this column should contain a text string.", "If the CSV file does not have a header, the first column is assumed to be the text data.", "Notes:", "- The function uses pandas for data manipulation, sklearn's CountVectorizer for text vectorization, and matplotlib for plotting.", "- A predefined list of stopwords is used to filter out common but insignificant words from the histogram."], "note": [], "params": ["file_path (str): The path to the input CSV file.", "save_path (str, optional): The path where the histogram plot will be saved. If not provided, the plot is displayed on the screen."], "returns": ["matplotlib.axes.Axes: The Axes object of the plot if save_path is not provided.", "Useful for further customization or display in notebooks.", "None: If save_path is provided, the plot is saved to the specified path,", "and the function returns None."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": ["FileNotFoundError: If the specified file_path does not exist. 
It raises a", "FileNotFoundError with a message indicating the file path that was not found.", "Exception: For any other errors that occur during the function execution.", "In this case, the error is printed to the console, and None is returned."], "example": ["Examples:", ">>> ax = f_897('text_data.csv')", ">>> print(ax)", "Axes(0.125,0.11;0.775x0.77)", ">>> result = f_897('text_data.csv', 'output_plot.png')", ">>> print(result)", "None"]}} -{"task_id": "f_916", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_916(list_of_lists):\n \"\"\"\n Generate a list of pandas Series objects, where each Series is indexed by the elements of a sub-list from `list_of_lists`.\n Each Series contains unique integers starting from 1 and going up to the length of the respective sub-list. These integers\n are shuffled randomly to create a unique ordering for each Series.\n\n Parameters:\n - list_of_lists (list of list): This parameter is expected to be a list where each element is itself a list.\n These inner lists are used as indices for the Series objects. Each inner list represents the index of one Series.\n\n Returns:\n - series_list (list of pandas.Series): This function returns a list. Each element in this list is a pandas Series object.\n The Series objects are indexed by the elements of the sub-lists provided in `list_of_lists`. The values in each Series\n are unique integers that are randomly shuffled.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n - Here's an example demonstrating how to use this function:\n >>> import numpy as np\n >>> np.random.seed(0) # Setting a seed for reproducibility of the example\n >>> series = f_916([['x', 'y', 'z'], ['a', 'b', 'c']])\n >>> for s in series: print(s)\n x 3\n y 2\n z 1\n dtype: int64\n a 3\n b 1\n c 2\n dtype: int64\n\n Note:\n - The function uses numpy's random shuffle, which modifies the sequence in-place. 
Therefore, each call to the function\n may produce different Series values unless the random seed is set beforehand.\n \"\"\"", "canonical_solution": " series_list = []\n for sublist in list_of_lists:\n values = np.arange(1, len(sublist) + 1)\n np.random.shuffle(values)\n s = pd.Series(values, index=sublist)\n series_list.append(s)\n\n return series_list", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_916.\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test basic functionality of the function.\"\"\"\n np.random.seed(0)\n input_data = [[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 2)\n expected_indexes = [[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_different_lengths(self):\n \"\"\"Test with sub-lists of different lengths.\"\"\"\n np.random.seed(1)\n input_data = [[\"m\", \"n\"], [\"p\", \"q\", \"r\", \"s\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 2)\n expected_indexes = [[\"m\", \"n\"], [\"p\", \"q\", \"r\", \"s\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_single_element_list(self):\n \"\"\"Test with a single-element sub-list.\"\"\"\n np.random.seed(2)\n input_data = [[\"a\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 1)\n expected_indexes = [[\"a\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_mixed_lengths(self):\n \"\"\"Test with sub-lists of different lengths.\"\"\"\n np.random.seed(3)\n input_data = [[\"x\", \"y\", \"z\"], [\"a\", \"b\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 2)\n expected_indexes = 
[[\"x\", \"y\", \"z\"], [\"a\", \"b\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_multiple_series(self):\n \"\"\"Test with multiple sub-lists.\"\"\"\n np.random.seed(4)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\"], [\"m\", \"n\", \"o\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 3)\n expected_indexes = [[\"x\", \"y\"], [\"a\", \"b\"], [\"m\", \"n\", \"o\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])", "apis": ["numpy.arange", "pandas.Series", "numpy.random", "numpy.random.shuffle"], "libs": ["numpy", "pandas"], "doc": {"description": ["Generate a list of pandas Series objects, where each Series is indexed by the elements of a sub-list from `list_of_lists`.", "Each Series contains unique integers starting from 1 and going up to the length of the respective sub-list. These integers", "are shuffled randomly to create a unique ordering for each Series."], "note": ["The function uses numpy's random shuffle, which modifies the sequence in-place. Therefore, each call to the function", "may produce different Series values unless the random seed is set beforehand."], "params": ["list_of_lists (list of list): This parameter is expected to be a list where each element is itself a list.", "These inner lists are used as indices for the Series objects. Each inner list represents the index of one Series."], "returns": ["series_list (list of pandas.Series): This function returns a list. Each element in this list is a pandas Series object.", "The Series objects are indexed by the elements of the sub-lists provided in `list_of_lists`. 
The values in each Series", "are unique integers that are randomly shuffled."], "reqs": ["pandas", "numpy"], "raises": [], "example": ["- Here's an example demonstrating how to use this function:", ">>> import numpy as np", ">>> np.random.seed(0) # Setting a seed for reproducibility of the example", ">>> series = f_916([['x', 'y', 'z'], ['a', 'b', 'c']])", ">>> for s in series: print(s)", "x 3", "y 2", "z 1", "dtype: int64", "a 3", "b 1", "c 2", "dtype: int64"]}} -{"task_id": "f_765", "prompt": "import pandas as pd\nimport random\nimport re\n\ndef f_765(person_names, email_domains, num_records=5):\n \"\"\"\n Generate a DataFrame with a specified number of records containing personal names and emails. \n The emails are cleaned by replacing all occurrences of \"@\" with \"[at]\".\n \n Parameters:\n - person_names (list of str): A list of person names to use in the records.\n - email_domains (list of str): A list of email domains to use in the records.\n - num_records (int, optional): The number of records to generate. 
Default is 5.\n \n Returns:\n - DataFrame: A pandas DataFrame with columns 'Name' and 'Email' containing the person names and cleaned emails.\n \n Requirements:\n - pandas for DataFrame manipulation\n - random for random selection\n - re for regular expression operations\n \n Raises:\n - ValueError: If the number of names provided is less than the number of records requested or if no email domains are provided.\n \n Example:\n >>> f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2)\n Name Email\n 0 John Doe john[at]yahoo.com\n 1 Jane Smith jane[at]gmail.com\n >>> f_765(['Alice'], ['outlook.com'], 1)\n Name Email\n 0 Alice alice[at]outlook.com\n \"\"\"", "canonical_solution": " if len(person_names) < num_records or len(email_domains) == 0:\n raise ValueError(\"Insufficient number of names or domains provided.\")\n \n data = []\n \n # Randomly select 'num_records' names from the provided list\n selected_names = random.sample(person_names, num_records)\n\n for name in selected_names:\n email = re.sub('@', '[at]', '{}@{}'.format(name.split()[0].lower(), random.choice(email_domains)))\n data.append([name, email])\n\n df = pd.DataFrame(data, columns=['Name', 'Email'])\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n random.seed(0) # Initialize random seed\n result_df = f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2)\n self.assertTrue(isinstance(result_df, pd.DataFrame))\n self.assertEqual(len(result_df), 2)\n self.assertTrue(set(result_df.columns) == {'Name', 'Email'})\n self.assertTrue(all(result_df['Email'].str.contains('[at]')))\n \n def test_case_2(self):\n random.seed(0) # Initialize random seed\n result_df = f_765(['Alice'], ['outlook.com'], 1)\n self.assertTrue(isinstance(result_df, pd.DataFrame))\n self.assertEqual(len(result_df), 1)\n self.assertTrue(set(result_df.columns) == {'Name', 'Email'})\n 
self.assertTrue(all(result_df['Email'].str.contains('[at]')))\n \n def test_case_3(self):\n random.seed(0) # Initialize random seed\n with self.assertRaises(ValueError):\n f_765(['John Doe'], ['gmail.com'], 2)\n \n def test_case_4(self):\n random.seed(0) # Initialize random seed\n with self.assertRaises(ValueError):\n f_765(['John Doe', 'Jane Smith'], [], 2)\n \n def test_case_5(self):\n random.seed(0) # Initialize random seed\n result_df = f_765(['John Doe', 'Jane Smith', 'Bob'], ['gmail.com', 'yahoo.com'], 3)\n self.assertTrue(isinstance(result_df, pd.DataFrame))\n self.assertEqual(len(result_df), 3)\n self.assertTrue(set(result_df.columns) == {'Name', 'Email'})\n self.assertTrue(all(result_df['Email'].str.contains('[at]')))", "apis": ["pandas.DataFrame", "random.sample", "re.sub", "random.choice"], "libs": ["pandas", "re", "random"], "doc": {"description": ["Generate a DataFrame with a specified number of records containing personal names and emails.", "The emails are cleaned by replacing all occurrences of \"@\" with \"[at]\"."], "note": [], "params": ["person_names (list of str): A list of person names to use in the records.", "email_domains (list of str): A list of email domains to use in the records.", "num_records (int, optional): The number of records to generate. 
Default is 5."], "returns": ["DataFrame: A pandas DataFrame with columns 'Name' and 'Email' containing the person names and cleaned emails."], "reqs": ["pandas for DataFrame manipulation", "random for random selection", "re for regular expression operations"], "raises": ["ValueError: If the number of names provided is less than the number of records requested or if no email domains are provided."], "example": [">>> f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2)", "Name Email", "0 John Doe john[at]yahoo.com", "1 Jane Smith jane[at]gmail.com", ">>> f_765(['Alice'], ['outlook.com'], 1)", "Name Email", "0 Alice alice[at]outlook.com"]}} -{"task_id": "f_400", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_400(column, data):\n \"\"\"\n Analyze a list of sales data, calculate the sum, the mean, the minimum, the maximum of a given column,\n and return the bar chart plot for the given column without displaying it.\n\n Parameters:\n column (str): The column to analyze. Expected values are ['Product', 'Quantity Sold', 'Total Sales'].\n data (list): The sales data. Expected format: [['Product Name', Quantity Sold (int), Total Sales (int)], ...]\n The function checks for data validity in the quantity columns (must not be negative).\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the sum, mean, min, max of the column.\n - matplotlib.axes.Axes: The Axes object of the plotted bar chart. 
The bar chart will have Product in its\n x-axis and the title Bar Chart of (column).\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> data = [['Product A', 100, 10000], ['Product B', 150, 15000], ['Product C', 200, 20000]]\n >>> stats, plot = f_400('Total Sales', data)\n >>> stats\n {'sum': 45000, 'mean': 15000.0, 'min': 10000, 'max': 20000}\n >>> plot\n \n \"\"\"", "canonical_solution": " COLUMNS = [\"Product\", \"Quantity Sold\", \"Total Sales\"]\n df = pd.DataFrame(data, columns=COLUMNS)\n if (df[\"Quantity Sold\"] < 0).any() or (df[\"Total Sales\"] < 0).any():\n raise ValueError(\"Value must not be negative\")\n column_data = df[column]\n\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.mean(column_data),\n \"min\": np.min(column_data),\n \"max\": np.max(column_data),\n }\n\n ax = df.plot.bar(x=\"Product\", y=column, title=f\"Bar Chart of {column}\")\n\n return result, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test total sales\n scenarios = [\n (\n [\n [\"Product A\", 100, 10000],\n [\"Product B\", 150, 15000],\n [\"Product C\", 200, 20000],\n ],\n {\"sum\": 45000, \"mean\": 15000.0, \"min\": 10000, \"max\": 20000},\n ),\n (\n [\n [\"Product A\", 10, 1000],\n [\"Product B\", 20, 2000],\n [\"Product C\", 30, 3000],\n [\"Product D\", 40, 4000],\n ],\n {\"sum\": 10000, \"mean\": 2500.0, \"min\": 1000, \"max\": 4000},\n ),\n (\n [[\"Product A\", 5, 500]],\n {\"sum\": 500, \"mean\": 500.0, \"min\": 500, \"max\": 500},\n ),\n ]\n for data, expected in scenarios:\n with self.subTest(data=data):\n stats, ax = f_400(\"Total Sales\", data)\n self.assertDictEqual(stats, expected)\n self.assertEqual(ax.get_title(), \"Bar Chart of Total Sales\")\n plt.close(\"all\")\n def test_case_2(self):\n # Test quantity sold\n scenarios = [\n (\n [\n [\"Product A\", 100, 5000],\n [\"Product B\", 200, 6000],\n [\"Product C\", 300, 7000],\n ],\n {\"sum\": 600, \"mean\": 200.0, 
\"min\": 100, \"max\": 300},\n ),\n (\n [\n [\"Product A\", 5, 500],\n [\"Product B\", 10, 1000],\n [\"Product C\", 15, 1500],\n [\"Product D\", 20, 2000],\n [\"Product E\", 25, 2500],\n ],\n {\"sum\": 75, \"mean\": 15.0, \"min\": 5, \"max\": 25},\n ),\n ]\n for data, expected in scenarios:\n with self.subTest(data=data):\n stats, ax = f_400(\"Quantity Sold\", data)\n self.assertDictEqual(stats, expected)\n self.assertEqual(ax.get_title(), \"Bar Chart of Quantity Sold\")\n plt.close(\"all\")\n def test_case_3(self):\n # Test error handling - invalid column\n with self.assertRaises(KeyError):\n f_400(\"Invalid Column\", [[\"Product A\", 100, 10000]])\n def test_case_4(self):\n # Test error handling - empty data and negative values\n with self.assertRaises(Exception):\n f_400(\"Total Sales\", [])\n with self.assertRaises(Exception):\n f_400(\"Total Sales\", [[\"Product A\", -100, -10000]])\n def test_case_5(self):\n # Test plot data integrity\n data = [[\"Product A\", 100, 5000], [\"Product B\", 200, 10000]]\n _, ax = f_400(\"Quantity Sold\", data)\n bars = [rect.get_height() for rect in ax.patches]\n expected_bars = [100, 200]\n self.assertEqual(bars, expected_bars)\n plt.close(\"all\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.mean", "numpy.min", "numpy.max", "numpy.sum", "pandas.DataFrame"], "libs": ["numpy", "pandas"], "doc": {"description": ["Analyze a list of sales data, calculate the sum, the mean, the minimum, the maximum of a given column,", "and return the bar chart plot for the given column without displaying it."], "note": [], "params": ["column (str): The column to analyze. Expected values are ['Product', 'Quantity Sold', 'Total Sales'].", "data (list): The sales data. 
Expected format: [['Product Name', Quantity Sold (int), Total Sales (int)], ...]", "The function checks for data validity in the quantity columns (must not be negative)."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the sum, mean, min, max of the column.", "matplotlib.axes.Axes: The Axes object of the plotted bar chart. The bar chart will have Product in its", "x-axis and the title Bar Chart of (column)."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> data = [['Product A', 100, 10000], ['Product B', 150, 15000], ['Product C', 200, 20000]]", ">>> stats, plot = f_400('Total Sales', data)", ">>> stats", "{'sum': 45000, 'mean': 15000.0, 'min': 10000, 'max': 20000}", ">>> plot", ""]}} -{"task_id": "f_610", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\n\ndef f_610(data_path):\n \"\"\"\n Normalizes a dataset from a .csv file.\n \n Parameters:\n - data_path (str): The path to the csv data file.\n\n Returns:\n - df (DataFrame): The normalized dataset.\n\n Requirements:\n - pandas\n - sklearn\n \n Example:\n >>> df = f_610('path_to_data_file.csv')\n \"\"\"", "canonical_solution": " df = pd.read_csv(data_path)\n data = df.to_numpy()\n \n scaler = MinMaxScaler()\n data = scaler.fit_transform(data)\n\n df = pd.DataFrame(data, columns=df.columns)\n\n return df", "test": "import unittest\nimport os\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Create data\n data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 1)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 1)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 1)\n # Remove data\n 
os.remove('data.csv')\n def test_case_2(self):\n # Create data\n data = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 0)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 0)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 0)\n # Remove data\n os.remove('data.csv')\n def test_case_3(self):\n # Create data\n data = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 0)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 0)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 0)\n # Remove data\n os.remove('data.csv')\n def test_case_4(self):\n # Create data\n data = np.array([[3, 2, 1], [6, 5, 4], [9, 8, 7]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 1)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 1)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 1)\n # Remove data\n os.remove('data.csv')\n def test_case_5(self):\n # Create data\n data = np.array([[1, 2, 3], [4, 5, 6]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n 
self.assertEqual(df.shape, (2, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 1)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 1)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 1)\n # Remove data\n os.remove('data.csv')", "apis": ["sklearn.preprocessing.MinMaxScaler", "pandas.read_csv", "pandas.DataFrame"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Normalizes a dataset from a .csv file."], "note": [], "params": ["data_path (str): The path to the csv data file."], "returns": ["df (DataFrame): The normalized dataset."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> df = f_610('path_to_data_file.csv')"]}} -{"task_id": "f_343", "prompt": "import pickle\nimport os\nimport matplotlib.pyplot as plt\n\n\ndef f_343(numbers, file_path=\"save.pkl\"):\n \"\"\"\n Save a Matplotlib image generated from the provided \"numbers\" list in a pickle file.\n The function then reads the image back from the file for validation and deletes the pickle file afterward.\n\n Parameters:\n - numbers (list): List of int/float values used to generate the matplotlib figure.\n - file_path (str): Path to temporary pickle file. 
Defaults to 'save.pkl'.\n\n Returns:\n - loaded_fig (matplotlib.figure.Figure): The loaded matplotlib figure from file_path.\n\n Requirements:\n - pickle\n - os\n - matplotlib.pyplot\n\n Example:\n >>> numbers = [random.random() for _ in range(100)]\n >>> loaded_fig = f_343(numbers)\n >>> type(loaded_fig)\n \n \"\"\"", "canonical_solution": "\n if not isinstance(numbers, list) or not all(\n isinstance(item, (int, float)) for item in numbers\n ):\n raise TypeError(\"Expect list of numbers.\")\n\n fig = plt.figure()\n plt.plot(numbers)\n\n with open(file_path, \"wb\") as file:\n pickle.dump(fig, file)\n\n with open(file_path, \"rb\") as file:\n loaded_fig = pickle.load(file)\n\n os.remove(file_path)\n\n return loaded_fig", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport tempfile\nimport os\nimport random\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n random.seed(0)\n def test_case_1(self):\n # Test default case - correct file was generated & correct removal\n numbers = list(range(10))\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), \"Pickle file was not deleted.\")\n def test_case_2(self):\n # Test when saving intermediate file to specified location\n numbers = list(range(10))\n path = os.path.join(self.temp_dir.name, \"default.pkl\")\n loaded_fig = f_343(numbers, path)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(path), \"Pickle file was not deleted.\")\n def test_case_3(self):\n # Test with floats\n numbers = [random.random() for _ in range(10)]\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), 
\"Pickle file was not deleted.\")\n def test_case_4(self):\n # Test with a mix of positive, negative, integer, and floating numbers\n numbers = [1, -1, 2.5, -2.5, 3, -3, 4.5, -4.5]\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), \"Pickle file was not deleted.\")\n def test_case_5(self):\n # Test with an empty list\n numbers = []\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), \"Pickle file was not deleted.\")\n def test_case_6(self):\n # Function should fail when there's invalid input\n with self.assertRaises(TypeError):\n f_343(\"123\")\n with self.assertRaises(TypeError):\n f_343([\"1\", \"2\", \"3\"])\n with self.assertRaises(TypeError):\n f_343([None, None, None])\n def tearDown(self):\n plt.close(\"all\")\n self.temp_dir.cleanup()", "apis": ["pickle.dump", "matplotlib.pyplot.plot", "matplotlib.pyplot.figure", "pickle.load", "os.remove"], "libs": ["matplotlib", "pickle", "os"], "doc": {"description": ["Save a Matplotlib image generated from the provided \"numbers\" list in a pickle file.", "The function then reads the image back from the file for validation and deletes the pickle file afterward."], "note": [], "params": ["numbers (list): List of int/float values used to generate the matplotlib figure.", "file_path (str): Path to temporary pickle file. 
Defaults to 'save.pkl'."], "returns": ["loaded_fig (matplotlib.figure.Figure): The loaded matplotlib figure from file_path."], "reqs": ["pickle", "os", "matplotlib.pyplot"], "raises": [], "example": [">>> numbers = [random.random() for _ in range(100)]", ">>> loaded_fig = f_343(numbers)", ">>> type(loaded_fig)", ""]}} -{"task_id": "f_748", "prompt": "import zipfile\nimport os\nimport re\nimport shutil\n\ndef f_748(source_dir: str, target_dir: str, archive_name: str = 'archive.zip') -> str:\n \"\"\"\n Archives all processed files from a source directory to a target directory.\n The function identifies processed files by the '_processed' suffix in the filename.\n\n Parameters:\n source_dir (str): The directory containing the files to be archived.\n target_dir (str): The directory where the archive will be saved.\n archive_name (str): The name of the archive file. Default is 'archive.zip'.\n\n Returns:\n str: The path to the created archive.\n\n Requirements:\n - os\n - re\n - shutil\n - zipfile\n\n Example:\n >>> f_748('./data/', './data_processed/')\n './data_processed/archive.zip'\n >>> f_748('./data/', './data_processed/', 'my_archive.zip')\n './data_processed/my_archive.zip'\n \"\"\"", "canonical_solution": " \n # Create directories if they don't exist\n os.makedirs(source_dir, exist_ok=True)\n os.makedirs(target_dir, exist_ok=True)\n \n archive_path = os.path.join(target_dir, archive_name)\n \n with zipfile.ZipFile(archive_path, 'w') as archive:\n for file in os.listdir(source_dir):\n if re.search(r'_processed$', os.path.splitext(file)[0]):\n archive.write(os.path.join(source_dir, file), arcname=file)\n shutil.move(os.path.join(source_dir, file), target_dir)\n \n return archive_path", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Setup test directories\n self.source_dir = 'f_748_data_wenhao/'\n self.target_dir = 'f_748_data_wenhao_target/'\n \n # Remove any existing test directories to start fresh\n if 
os.path.exists(self.source_dir):\n shutil.rmtree(self.source_dir)\n if os.path.exists(self.target_dir):\n shutil.rmtree(self.target_dir)\n # Create new test directories\n os.makedirs(self.source_dir)\n os.makedirs(self.target_dir)\n def tearDown(self):\n # Clean up test directories after each test case\n if os.path.exists(self.source_dir):\n shutil.rmtree(self.source_dir)\n if os.path.exists(self.target_dir):\n shutil.rmtree(self.target_dir)\n \n def test_case_1(self):\n # Create some test files in the source directory, some with '_processed' suffix\n test_files = ['file1.txt', 'file2_processed.txt']\n for file in test_files:\n with open(os.path.join(self.source_dir, file), 'w') as f:\n f.write(f\"This is {file}\")\n \n # Archive processed files\n archive_path = f_748(self.source_dir, self.target_dir)\n \n # Check if the archive contains the correct file\n with zipfile.ZipFile(archive_path, 'r') as archive:\n self.assertIn('file2_processed.txt', archive.namelist())\n \n def test_case_2(self):\n # Create some test files in the source directory without '_processed' suffix\n test_files = ['file1.txt', 'file3.txt']\n for file in test_files:\n with open(os.path.join(self.source_dir, file), 'w') as f:\n f.write(f\"This is {file}\")\n \n # Archive processed files\n archive_path = f_748(self.source_dir, self.target_dir)\n \n # Check if the archive is empty\n with zipfile.ZipFile(archive_path, 'r') as archive:\n self.assertEqual(len(archive.namelist()), 0)\n \n def test_case_3(self):\n # Source directory is empty\n archive_path = f_748(self.source_dir, self.target_dir)\n \n # Check if the archive is empty\n with zipfile.ZipFile(archive_path, 'r') as archive:\n self.assertEqual(len(archive.namelist()), 0)\n def test_case_4(self):\n # Create some test files in the source directory, some with '_processed' suffix\n test_files = ['file1.txt', 'file2_processed.txt']\n for file in test_files:\n with open(os.path.join(self.source_dir, file), 'w') as f:\n f.write(f\"This is 
{file}\")\n \n # Archive processed files with a custom archive name\n custom_archive_name = 'custom_archive.zip'\n archive_path = f_748(self.source_dir, self.target_dir, custom_archive_name)\n \n # Check if the custom archive name is used\n self.assertTrue(custom_archive_name in archive_path)\n \n def test_case_5(self):\n # Check the return value for correct archive path\n archive_path = f_748(self.source_dir, self.target_dir)\n expected_path = os.path.join(self.target_dir, 'archive.zip')\n self.assertEqual(archive_path, expected_path)", "apis": ["re.search", "os.listdir", "os.path.splitext", "zipfile.ZipFile", "shutil.move", "os.path", "os.makedirs", "os.path.join"], "libs": ["re", "zipfile", "os", "shutil"], "doc": {"description": ["Archives all processed files from a source directory to a target directory.", "The function identifies processed files by the '_processed' suffix in the filename."], "note": [], "params": ["source_dir (str): The directory containing the files to be archived.", "target_dir (str): The directory where the archive will be saved.", "archive_name (str): The name of the archive file. Default is 'archive.zip'."], "returns": ["str: The path to the created archive."], "reqs": ["os", "re", "shutil", "zipfile"], "raises": [], "example": [">>> f_748('./data/', './data_processed/')", "'./data_processed/archive.zip'", ">>> f_748('./data/', './data_processed/', 'my_archive.zip')", "'./data_processed/my_archive.zip'"]}} -{"task_id": "f_787", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_787(start_date='2016-01-01', periods=13, freq='WOM-2FRI', seed=0):\n \"\"\"\n Generate a share price series for a specific period of time, plot the share prices, and return the DataFrame and the plot on the share prices over the given date range.\n\n Parameters:\n - start_date (str): The start date for the share price series in 'YYYY-MM-DD' format. 
Default is '2016-01-01'.\n - periods (int): The number of periods for which the share price needs to be generated. Default is 13.\n - freq (str): The frequency string conforming to pandas date offset aliases. Default is 'WOM-2FRI'.\n - seed (int, optional): The seed for the random number generator to ensure reproducibility. Default is None.\n\n Returns:\n - A tuple containing a pandas DataFrame with columns ['Date', 'Price'] and a Matplotlib Axes object for the plot.\n\n Examples:\n >>> df, ax = f_787('2020-01-01', 5, 'M', seed=42)\n >>> len(df)\n 5\n >>> df.iloc[0]['Price']\n 249.81604753894499\n >>> ax.title.get_text()\n 'Stock Prices'\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n date_range = pd.date_range(start=start_date, periods=periods, freq=freq)\n stock_prices = np.random.uniform(low=100, high=500, size=periods)\n\n prices_df = pd.DataFrame({'Date': date_range, 'Price': stock_prices})\n prices_df.set_index('Date', inplace=True)\n\n fig, ax = plt.subplots(figsize=(10, 6))\n # ax.plot(prices_df.index, prices_df['Price'], marker='o')\n prices_df.plot(ax=ax, marker='o')\n pd.plotting.register_matplotlib_converters()\n ax.set_title('Stock Prices')\n ax.set_xlabel('Date')\n ax.set_ylabel('Price')\n ax.grid(True)\n \n return prices_df, ax", "test": "import unittest\nimport pandas as pd\nfrom pandas.tseries.frequencies import to_offset\nfrom matplotlib import axes\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \n def test_default_parameters(self):\n df, ax = f_787(seed=42)\n self.assertIsInstance(df, pd.DataFrame, \"The output should be a pandas DataFrame\")\n self.assertIsInstance(ax, axes.Axes, \"The output should be a Matplotlib Axes object\")\n self.assertEqual(len(df), 13, \"DataFrame should contain 13 rows by default\")\n self.assertTrue((100 <= df['Price']).all() and (df['Price'] <= 500).all(), \"Stock prices should be between 100 and 500\")\n self.assertEqual(ax.title.get_text(), 'Stock Prices', \"Plot title 
should be 'Stock Prices'\")\n \n def test_specified_parameters(self):\n df, ax = f_787('2021-01-01', 5, 'M', seed=42)\n self.assertEqual(len(df), 5, \"DataFrame should contain 5 rows\")\n self.assertTrue((100 <= df['Price']).all() and (df['Price'] <= 500).all(), \"Stock prices should be between 100 and 500\")\n \n def test_business_day_frequency(self):\n df, ax = f_787('2021-01-01', 5, 'B', seed=42)\n self.assertEqual(len(df), 5, \"DataFrame should contain 5 rows\")\n \n def test_weekly_frequency_more_periods(self):\n df, ax = f_787('2021-01-01', 20, 'W', seed=42)\n self.assertEqual(len(df), 20, \"DataFrame should contain 20 rows\")\n \n def test_different_year(self):\n df, ax = f_787('2019-01-01', 10, 'W', seed=42)\n self.assertEqual", "apis": ["pandas.plotting", "numpy.random", "numpy.random.uniform", "numpy.random.seed", "pandas.date_range", "pandas.plotting.register_matplotlib_converters", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["numpy", "pandas", "matplotlib"], "doc": {"description": ["Generate a share price series for a specific period of time, plot the share prices, and return the DataFrame and the plot on the share prices over the given date range."], "note": [], "params": ["start_date (str): The start date for the share price series in 'YYYY-MM-DD' format. Default is '2016-01-01'.", "periods (int): The number of periods for which the share price needs to be generated. Default is 13.", "freq (str): The frequency string conforming to pandas date offset aliases. Default is 'WOM-2FRI'.", "seed (int, optional): The seed for the random number generator to ensure reproducibility. 
Default is None."], "returns": ["A tuple containing a pandas DataFrame with columns ['Date', 'Price'] and a Matplotlib Axes object for the plot."], "reqs": [], "raises": [], "example": ["Examples:", ">>> df, ax = f_787('2020-01-01', 5, 'M', seed=42)", ">>> len(df)", "5", ">>> df.iloc[0]['Price']", "249.81604753894499", ">>> ax.title.get_text()", "'Stock Prices'"]}} +{"task_id": "f_923", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_923(data):\n \"\"\"\n Processes a dictionary containing product names and their corresponding prices in string format. \n The function converts these string prices (which may include commas as thousand separators) into float values. \n It then calculates statistical measures (mean, median, and standard deviation) of these prices and \n generates a histogram to visually represent the distribution of the prices.\n\n Parameters:\n - data (dict): A dictionary with two keys: 'Product' and 'Price_String'. \n 'Product' is a list of product names, each name corresponding to a product.\n 'Price_String' is a list of prices in string format, associated with these products. \n The price strings can contain commas for thousand separators and a period for the decimal point (e.g., \"1,234.56\").\n\n Returns:\n - dict: Contains the calculated mean, median, and standard deviation (sample) of the prices. \n The keys are 'mean', 'median', and 'std_dev'.\n - matplotlib.axes._subplots.Axes: A subplot object that represents the histogram plot of the product prices. \n The histogram displays the frequency distribution of the prices.\n\n Note:\n - A histogram plot is generated using these prices, with automatic bin sizing ('auto'), a blue color, \n 70% opacity (alpha=0.7), and a relative width (rwidth) of 0.85 for the bars. 
\n - The histogram's title is set to 'Histogram of Product Prices', and the x and y-axis are labeled 'Price' and 'Frequency', respectively.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib\n\n Example:\n >>> results = f_923({'Product': ['Apple', 'Banana'], 'Price_String': ['1,234.00', '567.89']})\n >>> print(results)\n ({'mean': 900.9449999999999, 'median': 900.9449999999999, 'std_dev': 471.0108980161712}, (array([1., 1.]), array([ 567.89 , 900.945, 1234. ]), ))\n\n Note:\n - The function assumes that each product name in the 'Product' list has a corresponding price in the 'Price_String' list.\n - The histogram plot's appearance (like color, alpha, and rwidth) is pre-set but can be customized further if needed.\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n # Correctly convert string prices to float, accounting for commas\n df[\"Price_Float\"] = df[\"Price_String\"].apply(lambda x: float(x.replace(\",\", \"\")))\n\n mean_price = np.mean(df[\"Price_Float\"])\n median_price = np.median(df[\"Price_Float\"])\n # Use ddof=1 for sample standard deviation\n std_dev_price = np.std(df[\"Price_Float\"], ddof=1)\n\n # Histogram plot settings can be refined for better visualization\n ax = plt.hist(df[\"Price_Float\"], bins=\"auto\", color=\"blue\", alpha=0.7, rwidth=0.85)\n plt.title(\"Histogram of Product Prices\")\n plt.xlabel(\"Price\")\n plt.ylabel(\"Frequency\")\n\n return {\"mean\": mean_price, \"median\": median_price, \"std_dev\": std_dev_price}, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_923\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test basic functionality.\"\"\"\n sample_data = {\n \"Product\": [\"James\", \"Olivia\", \"Jamie\", \"Angela\", \"Jennifer\"],\n \"Price_String\": [\"2,213.00\", \"6,083.00\", \"5,461.00\", \"884.00\", \"2,783.00\"],\n }\n float_prices = [\n float(price.replace(\",\", \"\")) for price in sample_data[\"Price_String\"]\n ]\n expected_mean = 
np.mean(float_prices)\n expected_median = np.median(float_prices)\n expected_std_dev = np.std(float_prices, ddof=1)\n result, _ = f_923(sample_data)\n self.assertAlmostEqual(result[\"mean\"], expected_mean)\n self.assertAlmostEqual(result[\"median\"], expected_median)\n self.assertAlmostEqual(result[\"std_dev\"], expected_std_dev)\n def test_large_sample_size(self):\n \"\"\"Test large sample size.\"\"\"\n sample_data = {\n \"Product\": [\n \"Adam\",\n \"Lisa\",\n \"Scott\",\n \"Bianca\",\n \"Ashlee\",\n \"Shannon\",\n \"Michelle\",\n \"Robert\",\n \"Joseph\",\n \"Joshua\",\n \"Traci\",\n \"Jacob\",\n \"Daniel\",\n \"Timothy\",\n \"Paul\",\n ],\n \"Price_String\": [\n \"1,691.00\",\n \"967.00\",\n \"5,789.00\",\n \"6,806.00\",\n \"3,301.00\",\n \"5,319.00\",\n \"7,619.00\",\n \"134.00\",\n \"7,883.00\",\n \"5,028.00\",\n \"3,330.00\",\n \"5,253.00\",\n \"8,551.00\",\n \"1,631.00\",\n \"7,637.00\",\n ],\n }\n float_prices = [\n float(price.replace(\",\", \"\")) for price in sample_data[\"Price_String\"]\n ]\n expected_mean = np.mean(float_prices)\n expected_median = np.median(float_prices)\n expected_std_dev = np.std(float_prices, ddof=1)\n result, _ = f_923(sample_data)\n self.assertAlmostEqual(result[\"mean\"], expected_mean)\n self.assertAlmostEqual(result[\"median\"], expected_median)\n self.assertAlmostEqual(result[\"std_dev\"], expected_std_dev)\n def test_invalid_input(self):\n \"\"\"Test invalid input.\"\"\"\n with self.assertRaises(Exception):\n f_923({})\n with self.assertRaises(Exception):\n f_923({\"Product\": [\"Apple\"], \"Price_WrongKey\": [\"1,234.00\"]})\n def test_all_zero_prices(self):\n \"\"\"Test all zero prices.\"\"\"\n sample_data = {\n \"Product\": [\"Apple\", \"Banana\", \"Cherry\"],\n \"Price_String\": [\"0.00\", \"0.00\", \"0.00\"],\n }\n result, _ = f_923(sample_data)\n self.assertEqual(result[\"mean\"], 0)\n self.assertEqual(result[\"median\"], 0)\n self.assertEqual(result[\"std_dev\"], 0)\n def test_non_uniform_distribution(self):\n 
\"\"\"Test non-uniform distribution.\"\"\"\n sample_data = {\n \"Product\": [\"Apple\", \"Banana\", \"Cherry\", \"Date\", \"Fig\"],\n \"Price_String\": [\"1,000.00\", \"500.00\", \"1,500.00\", \"2,000.00\", \"2,500.00\"],\n }\n float_prices = [\n float(price.replace(\",\", \"\")) for price in sample_data[\"Price_String\"]\n ]\n expected_mean = np.mean(float_prices)\n expected_median = np.median(float_prices)\n expected_std_dev = np.std(float_prices, ddof=1)\n result, _ = f_923(sample_data)\n self.assertAlmostEqual(result[\"mean\"], expected_mean)\n self.assertAlmostEqual(result[\"median\"], expected_median)\n self.assertAlmostEqual(result[\"std_dev\"], expected_std_dev)\n def tearDown(self):\n plt.close()", "apis": ["pandas.DataFrame", "matplotlib.pyplot.hist", "numpy.mean", "matplotlib.pyplot.title", "numpy.median", "matplotlib.pyplot.ylabel", "numpy.std", "matplotlib.pyplot.xlabel"], "libs": ["numpy", "pandas", "matplotlib"], "doc": {"description": ["Processes a dictionary containing product names and their corresponding prices in string format.", "The function converts these string prices (which may include commas as thousand separators) into float values.", "It then calculates statistical measures (mean, median, and standard deviation) of these prices and", "generates a histogram to visually represent the distribution of the prices."], "note": ["A histogram plot is generated using these prices, with automatic bin sizing ('auto'), a blue color,", "70% opacity (alpha=0.7), and a relative width (rwidth) of 0.85 for the bars.", "The histogram's title is set to 'Histogram of Product Prices', and the x and y-axis are labeled 'Price' and 'Frequency', respectively.", "The function assumes that each product name in the 'Product' list has a corresponding price in the 'Price_String' list.", "The histogram plot's appearance (like color, alpha, and rwidth) is pre-set but can be customized further if needed."], "params": ["data (dict): A dictionary with two keys: 'Product' 
and 'Price_String'.", "'Product' is a list of product names, each name corresponding to a product.", "'Price_String' is a list of prices in string format, associated with these products.", "The price strings can contain commas for thousand separators and a period for the decimal point (e.g., \"1,234.56\")."], "returns": ["dict: Contains the calculated mean, median, and standard deviation (sample) of the prices.", "The keys are 'mean', 'median', and 'std_dev'.", "matplotlib.axes._subplots.Axes: A subplot object that represents the histogram plot of the product prices.", "The histogram displays the frequency distribution of the prices."], "reqs": ["pandas", "numpy", "matplotlib"], "raises": [], "example": [">>> results = f_923({'Product': ['Apple', 'Banana'], 'Price_String': ['1,234.00', '567.89']})", ">>> print(results)", "({'mean': 900.9449999999999, 'median': 900.9449999999999, 'std_dev': 471.0108980161712}, (array([1., 1.]), array([ 567.89 , 900.945, 1234. ]), ))"]}} +{"task_id": "f_328", "prompt": "import sqlite3\nimport pandas as pd\n\n\ndef f_328(db_file: str, query: str) -> pd.DataFrame:\n \"\"\"Query an SQLite database and return the results.\n\n This function connects to a given SQLite database, executes a given SQL query,\n and returns the results as a pandas DataFrame.\n\n Parameters:\n - db_file (str): Path to the SQLite database file.\n - query (str): SQL query to execute.\n\n Returns:\n - pd.DataFrame: A DataFrame containing the results of the executed query.\n\n Requirements:\n - sqlite3\n - pandas\n\n Example:\n >>> db_file = 'sample_database.db'\n >>> df = f_328(db_file, \"SELECT * FROM users WHERE name = 'John Doe'\")\n pd.DataFrame:\n id name age\n -- ---------- ---\n .. 
John Doe ..\n >>> df = f_328(db_file, \"SELECT age, COUNT(*) AS count FROM users GROUP BY age\")\n pd.DataFrame:\n age count\n --- -----\n 25 3\n \"\"\"", "canonical_solution": " with sqlite3.connect(db_file) as conn:\n return pd.read_sql_query(query, conn)", "test": "import unittest\nimport sqlite3\nfrom faker import Faker\nimport os\nclass TestCases(unittest.TestCase):\n fake = Faker()\n specific_names = [\n \"John Doe\",\n \"Jane Smith\",\n \"Alice Brown\",\n \"Bob White\",\n \"Charlie Green\",\n ]\n specific_ages = [25, 30, 35, 40, 45]\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up test data before running tests.\"\"\"\n cls.db_file = cls.generate_test_data_with_file()\n @staticmethod\n def generate_test_data_with_file() -> str:\n \"\"\"Generate test data and save it to a temporary SQLite database file.\"\"\"\n db_file = \"./temp_test_db.sqlite3\"\n if os.path.exists(db_file):\n os.remove(db_file)\n conn = sqlite3.connect(db_file)\n create_table_query = \"\"\"\n CREATE TABLE users (\n id INTEGER PRIMARY KEY,\n name TEXT NOT NULL,\n age INTEGER NOT NULL\n )\n \"\"\"\n conn.execute(create_table_query)\n for _ in range(100):\n name = TestCases.fake.name()\n age = TestCases.fake.random_int(min=20, max=70)\n conn.execute(\"INSERT INTO users (name, age) VALUES (?, ?)\", (name, age))\n for name, age in zip(TestCases.specific_names, TestCases.specific_ages):\n conn.execute(\"INSERT INTO users (name, age) VALUES (?, ?)\", (name, age))\n conn.commit()\n conn.close()\n return db_file\n def test_case_1(self):\n \"\"\"Test fetching all users.\"\"\"\n df = f_328(self.db_file, \"SELECT * FROM users\")\n self.assertEqual(len(df), 100 + len(self.specific_names))\n for name in self.specific_names:\n self.assertIn(name, df[\"name\"].values)\n def test_case_2(self):\n \"\"\"Test fetching specific users based on names.\"\"\"\n names_as_strings = \"', '\".join(self.specific_names)\n df = f_328(\n self.db_file,\n f\"SELECT name, age FROM users WHERE name IN 
('{names_as_strings}')\",\n )\n for name in self.specific_names:\n self.assertIn(name, df[\"name\"].values)\n for age in self.specific_ages:\n self.assertIn(age, df[\"age\"].values)\n def test_case_3(self):\n \"\"\"Test fetching users based on age condition.\"\"\"\n age_limit = self.fake.random_int(min=20, max=60)\n df = f_328(self.db_file, f\"SELECT * FROM users WHERE age > {age_limit}\")\n self.assertTrue(all(df[\"age\"] > age_limit))\n def test_case_4(self):\n \"\"\"Test fetching users and sorting by name.\"\"\"\n df = f_328(self.db_file, \"SELECT * FROM users ORDER BY name\")\n sorted_names = sorted(df[\"name\"].tolist())\n self.assertListEqual(df[\"name\"].tolist(), sorted_names)\n def test_case_5(self):\n \"\"\"Test fetching users based on age and sorting by age.\"\"\"\n age_limit = self.fake.random_int(min=20, max=30)\n df = f_328(\n self.db_file,\n f\"SELECT * FROM users WHERE age < {age_limit} ORDER BY age DESC\",\n )\n self.assertTrue(all(df[\"age\"] < age_limit))\n self.assertTrue(\n all(df[\"age\"].iloc[i] >= df[\"age\"].iloc[i + 1] for i in range(len(df) - 1))\n )\n @classmethod\n def tearDownClass(cls):\n \"\"\"Clean up test data after running tests.\"\"\"\n os.remove(cls.db_file)", "apis": ["pandas.DataFrame", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["pandas", "sqlite3"], "doc": {"description": ["Query an SQLite database and return the results.", "This function connects to a given SQLite database, executes a given SQL query,", "and returns the results as a pandas DataFrame."], "note": [], "params": ["db_file (str): Path to the SQLite database file.", "query (str): SQL query to execute."], "returns": ["pd.DataFrame: A DataFrame containing the results of the executed query."], "reqs": ["sqlite3", "pandas"], "raises": [], "example": [">>> db_file = 'sample_database.db'", ">>> df = f_328(db_file, \"SELECT * FROM users WHERE name = 'John Doe'\")", "pd.DataFrame:", "id name age", "-- ---------- ---", ".. 
John Doe ..", ">>> df = f_328(db_file, \"SELECT age, COUNT(*) AS count FROM users GROUP BY age\")", "pd.DataFrame:", "age count", "--- -----", "25 3"]}} +{"task_id": "f_741", "prompt": "import numpy as np\nimport random\n\ndef f_741(length=10000, seed=0):\n \"\"\"\n Generates a random walk of a specified length. A random walk is a path that consists of a series of random steps\n on some mathematical space. In this case, the steps are either +1 or -1, chosen with equal probability.\n\n Parameters:\n - length (int): The number of steps in the random walk. Must be a non-negative integer. Default is 10000.\n - seed (int, optional): An optional seed value to initialize the random number generator. Use this for reproducible results.\n \n Requirements:\n - numpy\n - random\n \n Returns:\n - np.array: A numpy array representing the positions of the walk at each step. Starts at 0.\n\n Raises:\n - ValueError: If `length` is negative.\n \n Example:\n >>> random.seed(0) # For reproducibility in doctest\n >>> walk = f_741(5)\n >>> walk.tolist()\n [0, 1, 2, 1, 0, 1]\n \"\"\"", "canonical_solution": " if length < 0:\n raise ValueError(\"length must be a non-negative integer\")\n random.seed(seed)\n steps = [1 if random.random() > 0.5 else -1 for _ in range(length)]\n walk = np.cumsum([0] + steps) # Starts at 0\n return walk", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n random.seed(42) # Setting seed for reproducibility\n def test_default_length(self):\n walk = f_741(seed=42)\n self.assertEqual(len(walk), 10001) # Includes starting point\n def test_custom_length(self):\n walk = f_741(5000, seed=42)\n self.assertEqual(len(walk), 5001) # Includes starting point\n def test_first_step_zero(self):\n walk = f_741(1, seed=42)\n self.assertEqual(walk[0], 0) # First position should be 0\n def test_negative_length(self):\n with self.assertRaises(ValueError):\n f_741(-1)\n def test_output_type(self):\n walk = f_741(5, seed=42)\n 
self.assertEqual(walk.tolist(), [0, 1, 0, -1, -2, -1])", "apis": ["numpy.cumsum", "random.random", "random.seed"], "libs": ["random", "numpy"], "doc": {"description": ["Generates a random walk of a specified length. A random walk is a path that consists of a series of random steps", "on some mathematical space. In this case, the steps are either +1 or -1, chosen with equal probability."], "note": [], "params": ["length (int): The number of steps in the random walk. Must be a non-negative integer. Default is 10000.", "seed (int, optional): An optional seed value to initialize the random number generator. Use this for reproducible results."], "returns": ["np.array: A numpy array representing the positions of the walk at each step. Starts at 0."], "reqs": ["numpy", "random"], "raises": ["ValueError: If `length` is negative."], "example": [">>> random.seed(0) # For reproducibility in doctest", ">>> walk = f_741(5)", ">>> walk.tolist()", "[0, 1, 2, 1, 0, 1]"]}} +{"task_id": "f_879", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_879(s1, s2):\n \"\"\"\n Visualize two Series using a swarm plot with a highlight on their intersecting data points.\n\n This function creates a swarm plot to visually compare two pandas Series. \n It highlights the intersection points between these two series by drawing red dashed lines at the intersecting data points.\n\n Parameters:\n - s1 (pd.Series): The first series of data. This series must have a unique name that identifies it in the plot.\n - s2 (pd.Series): The second series of data. Similar to s1, this series must also have a unique name.\n\n Returns:\n - ax (matplotlib.Axes): The Axes object of the plotted swarm chart. This object can be used for further customization of the plot if required.\n intersection_count (int): The number of unique intersecting data points between s1 and s2. 
\n This count gives a quick numerical summary of the overlap between the two series.\n\n Requirements:\n - pandas\n - seaborn\n - matplotlib\n\n Example:\n >>> s1 = pd.Series([1, 2, 3, 4, 5], name='Series1')\n >>> s2 = pd.Series([4, 5, 6, 7, 8], name='Series2')\n >>> ax, count = f_879(s1, s2)\n >>> ax.get_title()\n 'Overlap Between Series1 and Series2'\n \"\"\"", "canonical_solution": " # Find the intersection data points\n intersection = set(s1).intersection(set(s2))\n\n # Prepare data for visualization\n df1 = pd.DataFrame({s1.name: s1, \"Type\": \"Series1\"})\n df2 = pd.DataFrame({s2.name: s2, \"Type\": \"Series2\"})\n df = pd.concat([df1, df2], axis=0, ignore_index=True)\n\n # Create a swarm plot\n _, ax = plt.subplots(figsize=(10, 6))\n sns.swarmplot(x=df.columns[0], y=\"Type\", data=df, ax=ax)\n\n # Highlight intersection points\n for point in intersection:\n ax.axvline(x=point, color=\"red\", linestyle=\"--\")\n\n ax.set_title(f\"Overlap Between {s1.name} and {s2.name}\")\n\n return ax, len(intersection)", "test": "import pandas as pd\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_879.\"\"\"\n def test_intersection_exists(self):\n \"\"\"Test that the function works when the two series have an intersection.\"\"\"\n s1 = pd.Series([1, 2, 3, 4, 5], name=\"Series1\")\n s2 = pd.Series([4, 5, 6, 7, 8], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 2)\n def test_no_intersection(self):\n \"\"\"Test that the function works when the two series have no intersection.\"\"\"\n s1 = pd.Series([1, 2, 3], name=\"Series1\")\n s2 = pd.Series([4, 5, 6], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 0)\n def test_empty_series(self):\n \"\"\"Test that the function works when one of the 
series is empty.\"\"\"\n s1 = pd.Series([], name=\"Series1\")\n s2 = pd.Series([], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 0)\n def test_partial_intersection(self):\n \"\"\"Test that the function works when the two series have a partial intersection.\"\"\"\n s1 = pd.Series([1, 2], name=\"Series1\")\n s2 = pd.Series([2, 3], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 1)\n def test_identical_series(self):\n \"\"\"Test that the function works when the two series are identical.\"\"\"\n s1 = pd.Series([1, 2, 3], name=\"Series1\")\n s2 = pd.Series([1, 2, 3], name=\"Series2\")\n ax, intersection_count = f_879(s1, s2)\n self.assertEqual(ax.get_title(), \"Overlap Between Series1 and Series2\")\n self.assertEqual(intersection_count, 3)\n def tearDown(self):\n plt.clf()", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots", "seaborn.swarmplot", "pandas.concat"], "libs": ["seaborn", "pandas", "matplotlib"], "doc": {"description": ["Visualize two Series using a swarm plot with a highlight on their intersecting data points.", "This function creates a swarm plot to visually compare two pandas Series.", "It highlights the intersection points between these two series by drawing red dashed lines at the intersecting data points."], "note": [], "params": ["s1 (pd.Series): The first series of data. This series must have a unique name that identifies it in the plot.", "s2 (pd.Series): The second series of data. Similar to s1, this series must also have a unique name."], "returns": ["ax (matplotlib.Axes): The Axes object of the plotted swarm chart. 
This object can be used for further customization of the plot if required.", "intersection_count (int): The number of unique intersecting data points between s1 and s2.", "This count gives a quick numerical summary of the overlap between the two series."], "reqs": ["pandas", "seaborn", "matplotlib"], "raises": [], "example": [">>> s1 = pd.Series([1, 2, 3, 4, 5], name='Series1')", ">>> s2 = pd.Series([4, 5, 6, 7, 8], name='Series2')", ">>> ax, count = f_879(s1, s2)", ">>> ax.get_title()", "'Overlap Between Series1 and Series2'"]}} +{"task_id": "f_795", "prompt": "import pandas as pd\nimport random\nfrom datetime import datetime\n\n\ndef f_795(\n task_list,\n n_tasks,\n employees=[\"John Doe\", \"Jane Smith\", \"James Brown\", \"Mary Johnson\", \"Robert Davis\"],\n seed=None,\n):\n \"\"\"\n Randomly assigns a specified number of tasks to employees with a due date of the current day\n and returns a DataFrame with these assignments.\n\n Parameters:\n - task_list (list of str): List of tasks to be assigned.\n - n_tasks (int): Number of tasks to be assigned. This number should not be negative.\n - employees (list of str, optional): List of employee names to whom tasks can be assigned.\n If not provided, defaults to: ['John Doe', 'Jane Smith',\n 'James Brown', 'Mary Johnson', 'Robert Davis'].\n - seed (int, optional): Seed for the random number generator to ensure reproducibility. 
Defaults to None (not set).\n\n Returns:\n - pd.DataFrame: Contains columns 'Task Name', 'Assigned To', and 'Due Date', with each row representing an assigned task.\n\n Raises:\n - ValueError: If n_tasks is negative.\n\n Note:\n - Task names are sanitized by replacing spaces with underscores.\n - Due dates are set to the current system date.\n\n Requirements:\n - pandas\n - random\n - datetime\n\n Examples:\n >>> df = f_795(['Clean Office', 'Prepare Report', 'Client Meeting'], 2, seed=42)\n >>> df\n Task Name Assigned To Due Date\n 0 Client_Meeting John Doe 2024-04-13\n 1 Clean_Office James Brown 2024-04-13\n >>> type(df)\n \n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n if n_tasks < 0:\n raise ValueError(\"n_tasks cannot be negative.\")\n\n assignment_data = []\n for _ in range(n_tasks):\n if not task_list:\n break\n task_name = random.choice(task_list).replace(\" \", \"_\")\n employee = random.choice(employees)\n due_date = datetime.today().strftime(\"%Y-%m-%d\")\n assignment_data.append([task_name, employee, due_date])\n\n assignment_df = pd.DataFrame(\n assignment_data, columns=[\"Task Name\", \"Assigned To\", \"Due Date\"]\n )\n\n return assignment_df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.default_tasks = [\"Task_1\", \"Task_2\", \"Task_3\"]\n self.default_seed = 123\n self.expected_columns = {\"Task Name\", \"Assigned To\", \"Due Date\"}\n self.today_str = datetime.today().strftime(\"%Y-%m-%d\")\n def test_case_1(self):\n # Test basic functionality\n n_tasks = 2\n df = f_795(self.default_tasks, n_tasks, seed=self.default_seed)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), n_tasks)\n self.assertTrue(all(df[\"Due Date\"] == self.today_str))\n self.assertTrue(all(\"_\" in name for name in df[\"Task Name\"]))\n def test_case_2(self):\n # List of tasks containing special characters 
and spaces\n tasks = [\"Task #1\", \"Task @2\", \"Task 3\"]\n n_tasks = 2\n df = f_795(tasks, n_tasks, seed=self.default_seed)\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), n_tasks)\n def test_case_3(self):\n # Test n_tasks\n for n_tasks in [2, 10, 20, 100]:\n df = f_795(self.default_tasks, n_tasks, seed=self.default_seed)\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), n_tasks)\n def test_case_4(self):\n # Test error handling - negative tasks\n with self.assertRaises(ValueError):\n f_795(self.default_tasks, -1, seed=self.default_seed)\n def test_case_5(self):\n # Test zero task\n df = f_795(self.default_tasks, 0, seed=self.default_seed)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(set(df.columns), self.expected_columns)\n self.assertEqual(len(df), 0)\n def test_case_6(self):\n # Test empty task list\n df = f_795([], 2, seed=self.default_seed)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(len(df), 0)\n def test_case_7(self):\n # Test custom employee\n custom_employees = [\"Alice\", \"Bob\", \"Charlie\"]\n df = f_795(\n self.default_tasks, 200, employees=custom_employees, seed=self.default_seed\n )\n self.assertTrue(\n all(employee in custom_employees for employee in df[\"Assigned To\"])\n )\n def test_case_8(self):\n # Test random seed\n df1 = f_795(self.default_tasks, 50, seed=0)\n df2 = f_795(self.default_tasks, 50, seed=0)\n df3 = f_795(self.default_tasks, 50, seed=100)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertFalse(df1.equals(df3))\n def test_case_9(self):\n # Test task name with spaces\n tasks = [\"Task One\", \"Task Two\"]\n df = f_795(tasks, 2, seed=42)\n self.assertSetEqual(set(df[\"Task Name\"]), {\"Task_One\", \"Task_Two\"})\n def test_case_10(self):\n # Test task list with duplicates\n tasks = [\"Task\", \"Task\"]\n df = f_795(tasks, 2, 
seed=42)\n self.assertEqual(len(df), len(tasks))\n self.assertEqual(set(df[\"Task Name\"]), {\"Task\"})", "apis": ["pandas.DataFrame", "datetime.datetime.today", "random.seed", "random.choice"], "libs": ["random", "pandas", "datetime"], "doc": {"description": ["Randomly assigns a specified number of tasks to employees with a due date of the current day", "and returns a DataFrame with these assignments."], "note": ["Task names are sanitized by replacing spaces with underscores.", "Due dates are set to the current system date."], "params": ["task_list (list of str): List of tasks to be assigned.", "n_tasks (int): Number of tasks to be assigned. This number should not be negative.", "employees (list of str, optional): List of employee names to whom tasks can be assigned.", "If not provided, defaults to: ['John Doe', 'Jane Smith',", "'James Brown', 'Mary Johnson', 'Robert Davis'].", "seed (int, optional): Seed for the random number generator to ensure reproducibility. Defaults to None (not set)."], "returns": ["pd.DataFrame: Contains columns 'Task Name', 'Assigned To', and 'Due Date', with each row representing an assigned task."], "reqs": ["pandas", "random", "datetime"], "raises": ["ValueError: If n_tasks is negative."], "example": ["Examples:", ">>> df = f_795(['Clean Office', 'Prepare Report', 'Client Meeting'], 2, seed=42)", ">>> df", "Task Name Assigned To Due Date", "0 Client_Meeting John Doe 2024-04-13", "1 Clean_Office James Brown 2024-04-13", ">>> type(df)", ""]}} +{"task_id": "f_897", "prompt": "import pandas as pd\nfrom sklearn.feature_extraction.text import CountVectorizer\nimport matplotlib.pyplot as plt\n\n# Constants\nSTOP_WORDS = [\"a\", \"an\", \"the\", \"in\", \"on\", \"at\", \"and\", \"or\"]\n\n\ndef f_897(file_path, save_path=None):\n \"\"\"\n Processes a CSV file containing text data and generates a histogram of the ten most common words.\n\n This function reads a CSV file, which is expected to contain a single column of text data. 
It then splits the text\n into words and creates a histogram of the frequency of the top ten most common words, excluding a predefined set of\n stopwords. The resulting histogram can be either displayed on the screen or saved to a file.\n\n The CSV file should have a single column with the header 'Text'. Each row under this column should contain a text string.\n If the CSV file does not have a header, the first column is assumed to be the text data.\n\n Parameters:\n - file_path (str): The path to the input CSV file.\n - save_path (str, optional): The path where the histogram plot will be saved. If not provided, the plot is displayed on the screen.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object of the plot if save_path is not provided.\n Useful for further customization or display in notebooks.\n - None: If save_path is provided, the plot is saved to the specified path, \n and the function returns None.\n\n Raises:\n - FileNotFoundError: If the specified file_path does not exist. It raises a \n FileNotFoundError with a message indicating the file path that was not found.\n - Exception: For any other errors that occur during the function execution. 
\n In this case, the error is printed to the console, and None is returned.\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Notes:\n - The function uses pandas for data manipulation, sklearn's CountVectorizer for text vectorization, and matplotlib for plotting.\n - A predefined list of stopwords is used to filter out common but insignificant words from the histogram.\n\n Examples:\n >>> ax = f_897('text_data.csv')\n >>> print(ax)\n Axes(0.125,0.11;0.775x0.77)\n >>> result = f_897('text_data.csv', 'output_plot.png')\n >>> print(result)\n None\n \"\"\"", "canonical_solution": " try:\n # Reading the CSV file into a DataFrame\n df = pd.read_csv(file_path, usecols=[0], names=[\"Text\"], header=None)\n\n # Vectorizing the text\n vectorizer = CountVectorizer(stop_words=STOP_WORDS)\n word_count = vectorizer.fit_transform(df[\"Text\"].dropna())\n\n # Calculating word frequency\n sum_words = word_count.sum(axis=0)\n words_freq = [\n (word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()\n ]\n words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)\n\n # Preparing data for the top 10 words\n top_words = words_freq[:10]\n df_top = pd.DataFrame(top_words, columns=[\"Word\", \"Count\"])\n\n # Plotting\n ax = df_top.plot.bar(x=\"Word\", y=\"Count\", rot=0, legend=False)\n\n # Saving or displaying the plot\n if save_path:\n plt.savefig(save_path)\n plt.close()\n\n return None if save_path else ax\n\n except FileNotFoundError as exc:\n raise FileNotFoundError(f\"File not found: {file_path}\") from exc\n\n except Exception as e:\n print(f\"An error occurred: {e}\")\n return None", "test": "import unittest\nfrom unittest.mock import patch\nimport matplotlib.pyplot as plt\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_897.\"\"\"\n def tearDown(self):\n \"\"\"Clean up by removing files created during tests.\"\"\"\n plt.close()\n if os.path.exists(\"test_output.png\"):\n os.remove(\"test_output.png\")\n 
@patch(\"pandas.read_csv\")\n def test_display_plot(self, mock_read_csv):\n \"\"\"\n Test if the function displays a plot correctly when no save path is provided.\n \"\"\"\n # Mock data\n mock_read_csv.return_value = pd.DataFrame(\n {\"Text\": [\"word1 word2 word3\", \"word2 word3 word4\"]}\n )\n # Test\n result = f_897(\"dummy_path.csv\")\n print(result)\n self.assertIsNotNone(result)\n @patch(\"pandas.read_csv\")\n def test_save_plot(self, mock_read_csv):\n \"\"\"\n Test if the function saves a plot correctly when a save path is provided.\n \"\"\"\n # Mock data\n mock_read_csv.return_value = pd.DataFrame(\n {\"Text\": [\"word1 word2 word3\", \"word2 word3 word4\"]}\n )\n # Test\n result = f_897(\"dummy_path.csv\", \"test_output.png\")\n self.assertIsNone(result)\n self.assertTrue(os.path.exists(\"test_output.png\"))\n @patch(\"pandas.read_csv\")\n def test_empty_file(self, mock_read_csv):\n \"\"\"\n Test the function's behavior with an empty file.\n \"\"\"\n # Mock data\n mock_read_csv.return_value = pd.DataFrame({\"Text\": []})\n # Test\n result = f_897(\"dummy_path.csv\")\n self.assertIsNone(result)\n @patch(\"pandas.read_csv\")\n def test_invalid_file_path(self, mock_read_csv):\n \"\"\"\n Test the function's behavior with an invalid file path.\n \"\"\"\n mock_read_csv.side_effect = FileNotFoundError\n # Test\n with self.assertRaises(FileNotFoundError):\n f_897(\"invalid_path.csv\")\n @patch(\"pandas.read_csv\")\n def test_large_data_set(self, mock_read_csv):\n \"\"\"\n Test the function's behavior with a large data set.\n \"\"\"\n # Mock data: Generate a large dataset\n mock_read_csv.return_value = pd.DataFrame(\n {\"Text\": [\"word\" + str(i) for i in range(1000)]}\n )\n # Test\n result = f_897(\"dummy_path.csv\")\n self.assertIsNotNone(result)", "apis": ["pandas.read_csv", "sklearn.feature_extraction.text.CountVectorizer", "pandas.DataFrame", "matplotlib.pyplot.savefig", "matplotlib.pyplot.close"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": 
{"description": ["Processes a CSV file containing text data and generates a histogram of the ten most common words.", "This function reads a CSV file, which is expected to contain a single column of text data. It then splits the text", "into words and creates a histogram of the frequency of the top ten most common words, excluding a predefined set of", "stopwords. The resulting histogram can be either displayed on the screen or saved to a file.", "The CSV file should have a single column with the header 'Text'. Each row under this column should contain a text string.", "If the CSV file does not have a header, the first column is assumed to be the text data.", "Notes:", "- The function uses pandas for data manipulation, sklearn's CountVectorizer for text vectorization, and matplotlib for plotting.", "- A predefined list of stopwords is used to filter out common but insignificant words from the histogram."], "note": [], "params": ["file_path (str): The path to the input CSV file.", "save_path (str, optional): The path where the histogram plot will be saved. If not provided, the plot is displayed on the screen."], "returns": ["matplotlib.axes.Axes: The Axes object of the plot if save_path is not provided.", "Useful for further customization or display in notebooks.", "None: If save_path is provided, the plot is saved to the specified path,", "and the function returns None."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": ["FileNotFoundError: If the specified file_path does not exist. 
It raises a", "FileNotFoundError with a message indicating the file path that was not found.", "Exception: For any other errors that occur during the function execution.", "In this case, the error is printed to the console, and None is returned."], "example": ["Examples:", ">>> ax = f_897('text_data.csv')", ">>> print(ax)", "Axes(0.125,0.11;0.775x0.77)", ">>> result = f_897('text_data.csv', 'output_plot.png')", ">>> print(result)", "None"]}} +{"task_id": "f_916", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_916(list_of_lists):\n \"\"\"\n Generate a list of pandas Series objects, where each Series is indexed by the elements of a sub-list from `list_of_lists`.\n Each Series contains unique integers starting from 1 and going up to the length of the respective sub-list. These integers\n are shuffled randomly to create a unique ordering for each Series.\n\n Parameters:\n - list_of_lists (list of list): This parameter is expected to be a list where each element is itself a list.\n These inner lists are used as indices for the Series objects. Each inner list represents the index of one Series.\n\n Returns:\n - series_list (list of pandas.Series): This function returns a list. Each element in this list is a pandas Series object.\n The Series objects are indexed by the elements of the sub-lists provided in `list_of_lists`. The values in each Series\n are unique integers that are randomly shuffled.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n - Here's an example demonstrating how to use this function:\n >>> import numpy as np\n >>> np.random.seed(0) # Setting a seed for reproducibility of the example\n >>> series = f_916([['x', 'y', 'z'], ['a', 'b', 'c']])\n >>> for s in series: print(s)\n x 3\n y 2\n z 1\n dtype: int64\n a 3\n b 1\n c 2\n dtype: int64\n\n Note:\n - The function uses numpy's random shuffle, which modifies the sequence in-place. 
Therefore, each call to the function\n may produce different Series values unless the random seed is set beforehand.\n \"\"\"", "canonical_solution": " series_list = []\n for sublist in list_of_lists:\n values = np.arange(1, len(sublist) + 1)\n np.random.shuffle(values)\n s = pd.Series(values, index=sublist)\n series_list.append(s)\n\n return series_list", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_916.\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test basic functionality of the function.\"\"\"\n np.random.seed(0)\n input_data = [[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 2)\n expected_indexes = [[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_different_lengths(self):\n \"\"\"Test with sub-lists of different lengths.\"\"\"\n np.random.seed(1)\n input_data = [[\"m\", \"n\"], [\"p\", \"q\", \"r\", \"s\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 2)\n expected_indexes = [[\"m\", \"n\"], [\"p\", \"q\", \"r\", \"s\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_single_element_list(self):\n \"\"\"Test with a single-element sub-list.\"\"\"\n np.random.seed(2)\n input_data = [[\"a\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 1)\n expected_indexes = [[\"a\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_mixed_lengths(self):\n \"\"\"Test with sub-lists of different lengths.\"\"\"\n np.random.seed(3)\n input_data = [[\"x\", \"y\", \"z\"], [\"a\", \"b\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 2)\n expected_indexes = 
[[\"x\", \"y\", \"z\"], [\"a\", \"b\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])\n def test_multiple_series(self):\n \"\"\"Test with multiple sub-lists.\"\"\"\n np.random.seed(4)\n input_data = [[\"x\", \"y\"], [\"a\", \"b\"], [\"m\", \"n\", \"o\"]]\n result = f_916(input_data)\n self.assertEqual(len(result), 3)\n expected_indexes = [[\"x\", \"y\"], [\"a\", \"b\"], [\"m\", \"n\", \"o\"]]\n for i, s in enumerate(result):\n self.assertIsInstance(s, pd.Series)\n self.assertListEqual(list(s.index), expected_indexes[i])", "apis": ["numpy.random", "numpy.random.shuffle", "numpy.arange", "pandas.Series"], "libs": ["pandas", "numpy"], "doc": {"description": ["Generate a list of pandas Series objects, where each Series is indexed by the elements of a sub-list from `list_of_lists`.", "Each Series contains unique integers starting from 1 and going up to the length of the respective sub-list. These integers", "are shuffled randomly to create a unique ordering for each Series."], "note": ["The function uses numpy's random shuffle, which modifies the sequence in-place. Therefore, each call to the function", "may produce different Series values unless the random seed is set beforehand."], "params": ["list_of_lists (list of list): This parameter is expected to be a list where each element is itself a list.", "These inner lists are used as indices for the Series objects. Each inner list represents the index of one Series."], "returns": ["series_list (list of pandas.Series): This function returns a list. Each element in this list is a pandas Series object.", "The Series objects are indexed by the elements of the sub-lists provided in `list_of_lists`. 
The values in each Series", "are unique integers that are randomly shuffled."], "reqs": ["pandas", "numpy"], "raises": [], "example": ["- Here's an example demonstrating how to use this function:", ">>> import numpy as np", ">>> np.random.seed(0) # Setting a seed for reproducibility of the example", ">>> series = f_916([['x', 'y', 'z'], ['a', 'b', 'c']])", ">>> for s in series: print(s)", "x 3", "y 2", "z 1", "dtype: int64", "a 3", "b 1", "c 2", "dtype: int64"]}} +{"task_id": "f_765", "prompt": "import pandas as pd\nimport random\nimport re\n\ndef f_765(person_names, email_domains, num_records=5):\n \"\"\"\n Generate a DataFrame with a specified number of records containing personal names and emails. \n The emails are cleaned by replacing all occurrences of \"@\" with \"[at]\".\n \n Parameters:\n - person_names (list of str): A list of person names to use in the records.\n - email_domains (list of str): A list of email domains to use in the records.\n - num_records (int, optional): The number of records to generate. 
Default is 5.\n \n Returns:\n - DataFrame: A pandas DataFrame with columns 'Name' and 'Email' containing the person names and cleaned emails.\n \n Requirements:\n - pandas for DataFrame manipulation\n - random for random selection\n - re for regular expression operations\n \n Raises:\n - ValueError: If the number of names provided is less than the number of records requested or if no email domains are provided.\n \n Example:\n >>> random.seed(0) # Initialize random seed\n >>> f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2)\n Name Email\n 0 Jane Smith jane[at]gmail.com\n 1 John Doe john[at]yahoo.com\n >>> f_765(['Alice'], ['outlook.com'], 1)\n Name Email\n 0 Alice alice[at]outlook.com\n \"\"\"", "canonical_solution": " if len(person_names) < num_records or len(email_domains) == 0:\n raise ValueError(\"Insufficient number of names or domains provided.\")\n \n data = []\n \n # Randomly select 'num_records' names from the provided list\n selected_names = random.sample(person_names, num_records)\n\n for name in selected_names:\n email = re.sub('@', '[at]', '{}@{}'.format(name.split()[0].lower(), random.choice(email_domains)))\n data.append([name, email])\n\n df = pd.DataFrame(data, columns=['Name', 'Email'])\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n random.seed(0) # Initialize random seed\n result_df = f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2)\n self.assertTrue(isinstance(result_df, pd.DataFrame))\n self.assertEqual(len(result_df), 2)\n self.assertTrue(set(result_df.columns) == {'Name', 'Email'})\n self.assertTrue(all(result_df['Email'].str.contains('[at]')))\n \n def test_case_2(self):\n random.seed(0) # Initialize random seed\n result_df = f_765(['Alice'], ['outlook.com'], 1)\n self.assertTrue(isinstance(result_df, pd.DataFrame))\n self.assertEqual(len(result_df), 1)\n self.assertTrue(set(result_df.columns) == {'Name', 'Email'})\n 
self.assertTrue(all(result_df['Email'].str.contains('[at]')))\n \n def test_case_3(self):\n random.seed(0) # Initialize random seed\n with self.assertRaises(ValueError):\n f_765(['John Doe'], ['gmail.com'], 2)\n \n def test_case_4(self):\n random.seed(0) # Initialize random seed\n with self.assertRaises(ValueError):\n f_765(['John Doe', 'Jane Smith'], [], 2)\n \n def test_case_5(self):\n random.seed(0) # Initialize random seed\n result_df = f_765(['John Doe', 'Jane Smith', 'Bob'], ['gmail.com', 'yahoo.com'], 3)\n self.assertTrue(isinstance(result_df, pd.DataFrame))\n self.assertEqual(len(result_df), 3)\n self.assertTrue(set(result_df.columns) == {'Name', 'Email'})\n self.assertTrue(all(result_df['Email'].str.contains('[at]')))", "apis": ["re.sub", "random.choice", "pandas.DataFrame", "random.sample"], "libs": ["re", "pandas", "random"], "doc": {"description": ["Generate a DataFrame with a specified number of records containing personal names and emails.", "The emails are cleaned by replacing all occurrences of \"@\" with \"[at]\"."], "note": [], "params": ["person_names (list of str): A list of person names to use in the records.", "email_domains (list of str): A list of email domains to use in the records.", "num_records (int, optional): The number of records to generate. 
Default is 5."], "returns": ["DataFrame: A pandas DataFrame with columns 'Name' and 'Email' containing the person names and cleaned emails."], "reqs": ["pandas for DataFrame manipulation", "random for random selection", "re for regular expression operations"], "raises": ["ValueError: If the number of names provided is less than the number of records requested or if no email domains are provided."], "example": [">>> random.seed(0) # Initialize random seed", ">>> f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2)", "Name Email", "0 Jane Smith jane[at]gmail.com", "1 John Doe john[at]yahoo.com", ">>> f_765(['Alice'], ['outlook.com'], 1)", "Name Email", "0 Alice alice[at]outlook.com"]}} +{"task_id": "f_400", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_400(column, data):\n \"\"\"\n Analyze a list of sales data, calculate the sum, the mean, the minimum, the maximum of a given column,\n and return the bar chart plot for the given column without displaying it.\n\n Parameters:\n column (str): The column to analyze. Expected values are ['Product', 'Quantity Sold', 'Total Sales'].\n data (list): The sales data. Expected format: [['Product Name', Quantity Sold (int), Total Sales (int)], ...]\n The function checks for data validity in the quantity columns (must not be negative).\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the sum, mean, min, max of the column.\n - matplotlib.axes.Axes: The Axes object of the plotted bar chart. 
The bar chart will have Product in its\n x-axis and the title Bar Chart of (column).\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> data = [['Product A', 100, 10000], ['Product B', 150, 15000], ['Product C', 200, 20000]]\n >>> stats, plot = f_400('Total Sales', data)\n >>> stats\n {'sum': 45000, 'mean': 15000.0, 'min': 10000, 'max': 20000}\n >>> plot\n \n \"\"\"", "canonical_solution": " COLUMNS = [\"Product\", \"Quantity Sold\", \"Total Sales\"]\n df = pd.DataFrame(data, columns=COLUMNS)\n if (df[\"Quantity Sold\"] < 0).any() or (df[\"Total Sales\"] < 0).any():\n raise ValueError(\"Value must not be negative\")\n column_data = df[column]\n\n result = {\n \"sum\": np.sum(column_data),\n \"mean\": np.mean(column_data),\n \"min\": np.min(column_data),\n \"max\": np.max(column_data),\n }\n\n ax = df.plot.bar(x=\"Product\", y=column, title=f\"Bar Chart of {column}\")\n\n return result, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test total sales\n scenarios = [\n (\n [\n [\"Product A\", 100, 10000],\n [\"Product B\", 150, 15000],\n [\"Product C\", 200, 20000],\n ],\n {\"sum\": 45000, \"mean\": 15000.0, \"min\": 10000, \"max\": 20000},\n ),\n (\n [\n [\"Product A\", 10, 1000],\n [\"Product B\", 20, 2000],\n [\"Product C\", 30, 3000],\n [\"Product D\", 40, 4000],\n ],\n {\"sum\": 10000, \"mean\": 2500.0, \"min\": 1000, \"max\": 4000},\n ),\n (\n [[\"Product A\", 5, 500]],\n {\"sum\": 500, \"mean\": 500.0, \"min\": 500, \"max\": 500},\n ),\n ]\n for data, expected in scenarios:\n with self.subTest(data=data):\n stats, ax = f_400(\"Total Sales\", data)\n self.assertDictEqual(stats, expected)\n self.assertEqual(ax.get_title(), \"Bar Chart of Total Sales\")\n plt.close(\"all\")\n def test_case_2(self):\n # Test quantity sold\n scenarios = [\n (\n [\n [\"Product A\", 100, 5000],\n [\"Product B\", 200, 6000],\n [\"Product C\", 300, 7000],\n ],\n {\"sum\": 600, \"mean\": 200.0, 
\"min\": 100, \"max\": 300},\n ),\n (\n [\n [\"Product A\", 5, 500],\n [\"Product B\", 10, 1000],\n [\"Product C\", 15, 1500],\n [\"Product D\", 20, 2000],\n [\"Product E\", 25, 2500],\n ],\n {\"sum\": 75, \"mean\": 15.0, \"min\": 5, \"max\": 25},\n ),\n ]\n for data, expected in scenarios:\n with self.subTest(data=data):\n stats, ax = f_400(\"Quantity Sold\", data)\n self.assertDictEqual(stats, expected)\n self.assertEqual(ax.get_title(), \"Bar Chart of Quantity Sold\")\n plt.close(\"all\")\n def test_case_3(self):\n # Test error handling - invalid column\n with self.assertRaises(KeyError):\n f_400(\"Invalid Column\", [[\"Product A\", 100, 10000]])\n def test_case_4(self):\n # Test error handling - empty data and negative values\n with self.assertRaises(Exception):\n f_400(\"Total Sales\", [])\n with self.assertRaises(Exception):\n f_400(\"Total Sales\", [[\"Product A\", -100, -10000]])\n def test_case_5(self):\n # Test plot data integrity\n data = [[\"Product A\", 100, 5000], [\"Product B\", 200, 10000]]\n _, ax = f_400(\"Quantity Sold\", data)\n bars = [rect.get_height() for rect in ax.patches]\n expected_bars = [100, 200]\n self.assertEqual(bars, expected_bars)\n plt.close(\"all\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.sum", "pandas.DataFrame", "numpy.mean", "numpy.min", "numpy.max"], "libs": ["pandas", "numpy"], "doc": {"description": ["Analyze a list of sales data, calculate the sum, the mean, the minimum, the maximum of a given column,", "and return the bar chart plot for the given column without displaying it."], "note": [], "params": ["column (str): The column to analyze. Expected values are ['Product', 'Quantity Sold', 'Total Sales'].", "data (list): The sales data. 
Expected format: [['Product Name', Quantity Sold (int), Total Sales (int)], ...]", "The function checks for data validity in the quantity columns (must not be negative)."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the sum, mean, min, max of the column.", "matplotlib.axes.Axes: The Axes object of the plotted bar chart. The bar chart will have Product in its", "x-axis and the title Bar Chart of (column)."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> data = [['Product A', 100, 10000], ['Product B', 150, 15000], ['Product C', 200, 20000]]", ">>> stats, plot = f_400('Total Sales', data)", ">>> stats", "{'sum': 45000, 'mean': 15000.0, 'min': 10000, 'max': 20000}", ">>> plot", ""]}} +{"task_id": "f_610", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\n\ndef f_610(data_path):\n \"\"\"\n Normalizes a dataset from a .csv file.\n \n Parameters:\n - data_path (str): The path to the csv data file.\n\n Returns:\n - df (DataFrame): The normalized dataset.\n\n Requirements:\n - pandas\n - sklearn\n \n Example:\n >>> df = f_610('path_to_data_file.csv')\n \"\"\"", "canonical_solution": " df = pd.read_csv(data_path)\n data = df.to_numpy()\n \n scaler = MinMaxScaler()\n data = scaler.fit_transform(data)\n\n df = pd.DataFrame(data, columns=df.columns)\n\n return df", "test": "import unittest\nimport os\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Create data\n data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 1)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 1)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 1)\n # Remove data\n 
os.remove('data.csv')\n def test_case_2(self):\n # Create data\n data = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 0)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 0)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 0)\n # Remove data\n os.remove('data.csv')\n def test_case_3(self):\n # Create data\n data = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 0)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 0)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 0)\n # Remove data\n os.remove('data.csv')\n def test_case_4(self):\n # Create data\n data = np.array([[3, 2, 1], [6, 5, 4], [9, 8, 7]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n self.assertEqual(df.shape, (3, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 1)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 1)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 1)\n # Remove data\n os.remove('data.csv')\n def test_case_5(self):\n # Create data\n data = np.array([[1, 2, 3], [4, 5, 6]])\n df = pd.DataFrame(data, columns=['a', 'b', 'c'])\n df.to_csv('data.csv', index=False)\n # Run function\n df = f_610('data.csv')\n # Check result\n 
self.assertEqual(df.shape, (2, 3))\n self.assertAlmostEqual(df['a'].min(), 0)\n self.assertAlmostEqual(df['a'].max(), 1)\n self.assertAlmostEqual(df['b'].min(), 0)\n self.assertAlmostEqual(df['b'].max(), 1)\n self.assertAlmostEqual(df['c'].min(), 0)\n self.assertAlmostEqual(df['c'].max(), 1)\n # Remove data\n os.remove('data.csv')", "apis": ["pandas.read_csv", "pandas.DataFrame", "sklearn.preprocessing.MinMaxScaler"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Normalizes a dataset from a .csv file."], "note": [], "params": ["data_path (str): The path to the csv data file."], "returns": ["df (DataFrame): The normalized dataset."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> df = f_610('path_to_data_file.csv')"]}} +{"task_id": "f_343", "prompt": "import pickle\nimport os\nimport matplotlib.pyplot as plt\n\n\ndef f_343(numbers, file_path=\"save.pkl\"):\n \"\"\"\n Save a Matplotlib image generated from the provided \"numbers\" list in a pickle file.\n The function then reads the image back from the file for validation and deletes the pickle file afterward.\n\n Parameters:\n - numbers (list): List of int/float values used to generate the matplotlib figure.\n - file_path (str): Path to temporary pickle file. 
Defaults to 'save.pkl'.\n\n Returns:\n - loaded_fig (matplotlib.figure.Figure): The loaded matplotlib figure from file_path.\n\n Requirements:\n - pickle\n - os\n - matplotlib.pyplot\n\n Example:\n >>> numbers = [random.random() for _ in range(100)]\n >>> loaded_fig = f_343(numbers)\n >>> type(loaded_fig)\n \n \"\"\"", "canonical_solution": "\n if not isinstance(numbers, list) or not all(\n isinstance(item, (int, float)) for item in numbers\n ):\n raise TypeError(\"Expect list of numbers.\")\n\n fig = plt.figure()\n plt.plot(numbers)\n\n with open(file_path, \"wb\") as file:\n pickle.dump(fig, file)\n\n with open(file_path, \"rb\") as file:\n loaded_fig = pickle.load(file)\n\n os.remove(file_path)\n\n return loaded_fig", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport tempfile\nimport os\nimport random\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n random.seed(0)\n def test_case_1(self):\n # Test default case - correct file was generated & correct removal\n numbers = list(range(10))\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), \"Pickle file was not deleted.\")\n def test_case_2(self):\n # Test when saving intermediate file to specified location\n numbers = list(range(10))\n path = os.path.join(self.temp_dir.name, \"default.pkl\")\n loaded_fig = f_343(numbers, path)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(path), \"Pickle file was not deleted.\")\n def test_case_3(self):\n # Test with floats\n numbers = [random.random() for _ in range(10)]\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), 
\"Pickle file was not deleted.\")\n def test_case_4(self):\n # Test with a mix of positive, negative, integer, and floating numbers\n numbers = [1, -1, 2.5, -2.5, 3, -3, 4.5, -4.5]\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), \"Pickle file was not deleted.\")\n def test_case_5(self):\n # Test with an empty list\n numbers = []\n loaded_fig = f_343(numbers)\n self.assertIsInstance(\n loaded_fig,\n type(plt.figure()),\n \"Returned object is not a Matplotlib figure.\",\n )\n self.assertFalse(os.path.exists(\"save.pkl\"), \"Pickle file was not deleted.\")\n def test_case_6(self):\n # Function should fail when there's invalid input\n with self.assertRaises(TypeError):\n f_343(\"123\")\n with self.assertRaises(TypeError):\n f_343([\"1\", \"2\", \"3\"])\n with self.assertRaises(TypeError):\n f_343([None, None, None])\n def tearDown(self):\n plt.close(\"all\")\n self.temp_dir.cleanup()", "apis": ["matplotlib.pyplot.figure", "os.remove", "matplotlib.pyplot.plot", "pickle.load", "pickle.dump"], "libs": ["os", "matplotlib", "pickle"], "doc": {"description": ["Save a Matplotlib image generated from the provided \"numbers\" list in a pickle file.", "The function then reads the image back from the file for validation and deletes the pickle file afterward."], "note": [], "params": ["numbers (list): List of int/float values used to generate the matplotlib figure.", "file_path (str): Path to temporary pickle file. 
Defaults to 'save.pkl'."], "returns": ["loaded_fig (matplotlib.figure.Figure): The loaded matplotlib figure from file_path."], "reqs": ["pickle", "os", "matplotlib.pyplot"], "raises": [], "example": [">>> numbers = [random.random() for _ in range(100)]", ">>> loaded_fig = f_343(numbers)", ">>> type(loaded_fig)", ""]}} +{"task_id": "f_748", "prompt": "import zipfile\nimport os\nimport re\nimport shutil\n\ndef f_748(source_dir: str, target_dir: str, archive_name: str = 'archive.zip') -> str:\n \"\"\"\n Archives all processed files from a source directory to a target directory.\n The function identifies processed files by the '_processed' suffix in the filename.\n\n Parameters:\n source_dir (str): The directory containing the files to be archived.\n target_dir (str): The directory where the archive will be saved.\n archive_name (str): The name of the archive file. Default is 'archive.zip'.\n\n Returns:\n str: The path to the created archive.\n\n Requirements:\n - os\n - re\n - shutil\n - zipfile\n\n Example:\n >>> f_748('./data/', './data_processed/')\n './data_processed/archive.zip'\n >>> f_748('./data/', './data_processed/', 'my_archive.zip')\n './data_processed/my_archive.zip'\n \"\"\"", "canonical_solution": " \n # Create directories if they don't exist\n os.makedirs(source_dir, exist_ok=True)\n os.makedirs(target_dir, exist_ok=True)\n \n archive_path = os.path.join(target_dir, archive_name)\n \n with zipfile.ZipFile(archive_path, 'w') as archive:\n for file in os.listdir(source_dir):\n if re.search(r'_processed$', os.path.splitext(file)[0]):\n archive.write(os.path.join(source_dir, file), arcname=file)\n shutil.move(os.path.join(source_dir, file), target_dir)\n \n return archive_path", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Setup test directories\n self.source_dir = 'f_748_data_wenhao/'\n self.target_dir = 'f_748_data_wenhao_target/'\n \n # Remove any existing test directories to start fresh\n if 
os.path.exists(self.source_dir):\n shutil.rmtree(self.source_dir)\n if os.path.exists(self.target_dir):\n shutil.rmtree(self.target_dir)\n # Create new test directories\n os.makedirs(self.source_dir)\n os.makedirs(self.target_dir)\n def tearDown(self):\n # Clean up test directories after each test case\n if os.path.exists(self.source_dir):\n shutil.rmtree(self.source_dir)\n if os.path.exists(self.target_dir):\n shutil.rmtree(self.target_dir)\n \n def test_case_1(self):\n # Create some test files in the source directory, some with '_processed' suffix\n test_files = ['file1.txt', 'file2_processed.txt']\n for file in test_files:\n with open(os.path.join(self.source_dir, file), 'w') as f:\n f.write(f\"This is {file}\")\n \n # Archive processed files\n archive_path = f_748(self.source_dir, self.target_dir)\n \n # Check if the archive contains the correct file\n with zipfile.ZipFile(archive_path, 'r') as archive:\n self.assertIn('file2_processed.txt', archive.namelist())\n \n def test_case_2(self):\n # Create some test files in the source directory without '_processed' suffix\n test_files = ['file1.txt', 'file3.txt']\n for file in test_files:\n with open(os.path.join(self.source_dir, file), 'w') as f:\n f.write(f\"This is {file}\")\n \n # Archive processed files\n archive_path = f_748(self.source_dir, self.target_dir)\n \n # Check if the archive is empty\n with zipfile.ZipFile(archive_path, 'r') as archive:\n self.assertEqual(len(archive.namelist()), 0)\n \n def test_case_3(self):\n # Source directory is empty\n archive_path = f_748(self.source_dir, self.target_dir)\n \n # Check if the archive is empty\n with zipfile.ZipFile(archive_path, 'r') as archive:\n self.assertEqual(len(archive.namelist()), 0)\n def test_case_4(self):\n # Create some test files in the source directory, some with '_processed' suffix\n test_files = ['file1.txt', 'file2_processed.txt']\n for file in test_files:\n with open(os.path.join(self.source_dir, file), 'w') as f:\n f.write(f\"This is 
{file}\")\n \n # Archive processed files with a custom archive name\n custom_archive_name = 'custom_archive.zip'\n archive_path = f_748(self.source_dir, self.target_dir, custom_archive_name)\n \n # Check if the custom archive name is used\n self.assertTrue(custom_archive_name in archive_path)\n \n def test_case_5(self):\n # Check the return value for correct archive path\n archive_path = f_748(self.source_dir, self.target_dir)\n expected_path = os.path.join(self.target_dir, 'archive.zip')\n self.assertEqual(archive_path, expected_path)", "apis": ["zipfile.ZipFile", "os.path.splitext", "os.listdir", "os.makedirs", "re.search", "os.path", "os.path.join", "shutil.move"], "libs": ["zipfile", "shutil", "re", "os"], "doc": {"description": ["Archives all processed files from a source directory to a target directory.", "The function identifies processed files by the '_processed' suffix in the filename."], "note": [], "params": ["source_dir (str): The directory containing the files to be archived.", "target_dir (str): The directory where the archive will be saved.", "archive_name (str): The name of the archive file. Default is 'archive.zip'."], "returns": ["str: The path to the created archive."], "reqs": ["os", "re", "shutil", "zipfile"], "raises": [], "example": [">>> f_748('./data/', './data_processed/')", "'./data_processed/archive.zip'", ">>> f_748('./data/', './data_processed/', 'my_archive.zip')", "'./data_processed/my_archive.zip'"]}} +{"task_id": "f_787", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_787(start_date='2016-01-01', periods=13, freq='WOM-2FRI', seed=0):\n \"\"\"\n Generate a share price series for a specific period of time, plot the share prices, and return the DataFrame and the plot on the share prices over the given date range.\n\n Parameters:\n - start_date (str): The start date for the share price series in 'YYYY-MM-DD' format. 
Default is '2016-01-01'.\n - periods (int): The number of periods for which the share price needs to be generated. Default is 13.\n - freq (str): The frequency string conforming to pandas date offset aliases. Default is 'WOM-2FRI'.\n - seed (int, optional): The seed for the random number generator to ensure reproducibility. Default is None.\n\n Returns:\n - A tuple containing a pandas DataFrame with columns ['Date', 'Price'] and a Matplotlib Axes object for the plot.\n\n Examples:\n >>> df, ax = f_787('2020-01-01', 5, 'M', seed=42)\n >>> len(df)\n 5\n >>> df.iloc[0]['Price']\n 249.81604753894499\n >>> ax.title.get_text()\n 'Stock Prices'\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n date_range = pd.date_range(start=start_date, periods=periods, freq=freq)\n stock_prices = np.random.uniform(low=100, high=500, size=periods)\n\n prices_df = pd.DataFrame({'Date': date_range, 'Price': stock_prices})\n prices_df.set_index('Date', inplace=True)\n\n fig, ax = plt.subplots(figsize=(10, 6))\n # ax.plot(prices_df.index, prices_df['Price'], marker='o')\n prices_df.plot(ax=ax, marker='o')\n pd.plotting.register_matplotlib_converters()\n ax.set_title('Stock Prices')\n ax.set_xlabel('Date')\n ax.set_ylabel('Price')\n ax.grid(True)\n \n return prices_df, ax", "test": "import unittest\nimport pandas as pd\nfrom pandas.tseries.frequencies import to_offset\nfrom matplotlib import axes\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \n def test_default_parameters(self):\n df, ax = f_787(seed=42)\n self.assertIsInstance(df, pd.DataFrame, \"The output should be a pandas DataFrame\")\n self.assertIsInstance(ax, axes.Axes, \"The output should be a Matplotlib Axes object\")\n self.assertEqual(len(df), 13, \"DataFrame should contain 13 rows by default\")\n self.assertTrue((100 <= df['Price']).all() and (df['Price'] <= 500).all(), \"Stock prices should be between 100 and 500\")\n self.assertEqual(ax.title.get_text(), 'Stock Prices', \"Plot title 
should be 'Stock Prices'\")\n \n def test_specified_parameters(self):\n df, ax = f_787('2021-01-01', 5, 'M', seed=42)\n self.assertEqual(len(df), 5, \"DataFrame should contain 5 rows\")\n self.assertTrue((100 <= df['Price']).all() and (df['Price'] <= 500).all(), \"Stock prices should be between 100 and 500\")\n \n def test_business_day_frequency(self):\n df, ax = f_787('2021-01-01', 5, 'B', seed=42)\n self.assertEqual(len(df), 5, \"DataFrame should contain 5 rows\")\n \n def test_weekly_frequency_more_periods(self):\n df, ax = f_787('2021-01-01', 20, 'W', seed=42)\n self.assertEqual(len(df), 20, \"DataFrame should contain 20 rows\")\n \n def test_different_year(self):\n df, ax = f_787('2019-01-01', 10, 'W', seed=42)\n self.assertEqual", "apis": ["pandas.DataFrame", "numpy.random", "numpy.random.uniform", "pandas.plotting.register_matplotlib_converters", "pandas.date_range", "matplotlib.pyplot.subplots", "numpy.random.seed", "pandas.plotting"], "libs": ["numpy", "pandas", "matplotlib"], "doc": {"description": ["Generate a share price series for a specific period of time, plot the share prices, and return the DataFrame and the plot on the share prices over the given date range."], "note": [], "params": ["start_date (str): The start date for the share price series in 'YYYY-MM-DD' format. Default is '2016-01-01'.", "periods (int): The number of periods for which the share price needs to be generated. Default is 13.", "freq (str): The frequency string conforming to pandas date offset aliases. Default is 'WOM-2FRI'.", "seed (int, optional): The seed for the random number generator to ensure reproducibility. 
Default is None."], "returns": ["A tuple containing a pandas DataFrame with columns ['Date', 'Price'] and a Matplotlib Axes object for the plot."], "reqs": [], "raises": [], "example": ["Examples:", ">>> df, ax = f_787('2020-01-01', 5, 'M', seed=42)", ">>> len(df)", "5", ">>> df.iloc[0]['Price']", "249.81604753894499", ">>> ax.title.get_text()", "'Stock Prices'"]}} {"task_id": "f_557", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\ndef f_557(df):\n \"\"\"\n Given a Pandas DataFrame with random numeric values, standardize it with the standard scaler from sklearn.\n\n Parameters:\n - df (DataFrame): The DataFrame to be standardized.\n \n Returns:\n - df_standardized (DataFrame): The standardized DataFrame.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n >>> f_557(df)\n a b\n 0 -1.224745 -1.224745\n 1 0.000000 0.000000\n 2 1.224745 1.224745\n \"\"\"", "canonical_solution": " # Standardize data\n scaler = StandardScaler()\n df_standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)\n return df_standardized", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n df_standardized = f_557(df)\n self.assertAlmostEqual(df_standardized['a'].mean(), 0)\n self.assertAlmostEqual(df_standardized['a'].std(), 1.224744871391589)\n def test_case_2(self):\n df = pd.DataFrame({'a': [1, 1, 1], 'b': [1, 1, 1]})\n df_standardized = f_557(df)\n self.assertAlmostEqual(df_standardized['a'].mean(), 0)\n self.assertAlmostEqual(df_standardized['a'].std(), 0)\n def test_case_3(self):\n df = pd.DataFrame({'a': [1, 0, -1], 'b': [0, 1, 0]})\n df_standardized = f_557(df)\n print(df_standardized)\n self.assertAlmostEqual(df_standardized['a'].mean(), 0)\n self.assertAlmostEqual(df_standardized['a'].std(), 1.224744871391589)\n def test_case_4(self):\n df = pd.DataFrame({'z': [1, 2, 3], 'y': [4, 
5, 6]})\n df_standardized = f_557(df)\n self.assertAlmostEqual(df_standardized['z'].mean(), 0)\n self.assertAlmostEqual(df_standardized['z'].std(), 1.224744871391589)\n def test_case_5(self):\n df = pd.DataFrame({'z': [1, 2, 3], 'y': [4, 5, 6]})\n df_standardized = f_557(df)\n self.assertAlmostEqual(df_standardized['y'].mean(), 0)\n self.assertAlmostEqual(df_standardized['y'].std(), 1.224744871391589)", "apis": ["pandas.DataFrame", "sklearn.preprocessing.StandardScaler"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Given a Pandas DataFrame with random numeric values, standardize it with the standard scaler from sklearn."], "note": [], "params": ["df (DataFrame): The DataFrame to be standardized."], "returns": ["df_standardized (DataFrame): The standardized DataFrame."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})", ">>> f_557(df)", "a b", "0 -1.224745 -1.224745", "1 0.000000 0.000000", "2 1.224745 1.224745"]}} -{"task_id": "f_329", "prompt": "import pandas as pd\nimport json\n\n\ndef f_329(data: dict, output_path: str = \"./default_data_output.json\") -> str:\n \"\"\"Converts the given DataFrame to a dictionary, dropping the column named 'c'\n if it exists, and then saves it as a JSON file.\n\n Parameters:\n - data (dict): The input data dictionary.\n - output_path (str, optional): The path where the JSON file should be saved. 
Default is './default_data_output.json'.\n\n Returns:\n - str: Path where the JSON file was saved.\n\n Requirements:\n - pandas\n - json\n\n Example:\n >>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]})\n './default_data_output.json'\n >>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]}, 'custom/path/results.json')\n 'custom/path/results.json'\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n # Drop column named 'c' if it exists\n df = df.drop(columns=\"c\", errors=\"ignore\")\n # Convert the DataFrame to dictionary\n data_dict = df.to_dict(orient=\"dict\")\n # Save the dictionary as a JSON file\n with open(output_path, \"w\") as file:\n json.dump(data_dict, file)\n\n return output_path", "test": "import unittest\nimport pandas as pd\nimport json\nimport os\nclass TestCases(unittest.TestCase):\n def read_json_file(self, path):\n # Helper function to read content from a JSON file\n with open(path, \"r\") as f:\n return json.load(f)\n def tearDown(self):\n # Cleanup procedure after each test to remove generated files\n files_to_remove = [\n \"./default_data_output.json\",\n \"./custom_data_output_2.json\",\n \"./custom_data_output_3.json\",\n \"./custom_data_output_4.json\",\n \"./custom_data_output_5.json\",\n ]\n for file in files_to_remove:\n if os.path.exists(file):\n os.remove(file)\n def convert_keys_to_str(self, dictionary):\n # Convert dictionary keys to strings recursively\n if not isinstance(dictionary, dict):\n return dictionary\n return {str(k): self.convert_keys_to_str(v) for k, v in dictionary.items()}\n def test_case_1(self):\n # Test basic DataFrame with column \"c\"\n data = {\"a\": [1, 2], \"b\": [3, 4], \"c\": [5, 6]}\n df = pd.DataFrame(data)\n output_path = f_329(data)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(\n df.drop(columns=\"c\").to_dict(orient=\"dict\")\n )\n self.assertEqual(self.read_json_file(output_path), expected_data)\n def test_case_2(self):\n # Test DataFrame with non-numeric data 
and column \"c\"\n data = {\"name\": [\"Alice\", \"Bob\"], \"country\": [\"USA\", \"Canada\"], \"c\": [\"x\", \"y\"]}\n df = pd.DataFrame(data)\n custom_path = \"./custom_data_output_2.json\"\n output_path = f_329(data, custom_path)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(\n df.drop(columns=\"c\").to_dict(orient=\"dict\")\n )\n self.assertEqual(self.read_json_file(output_path), expected_data)\n def test_case_3(self):\n # Test DataFrame with multiple columns and no column \"c\"\n data = {\"age\": [25, 30], \"height\": [170, 175]}\n df = pd.DataFrame(data)\n custom_path = \"./custom_data_output_3.json\"\n output_path = f_329(data, custom_path)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(df.to_dict(orient=\"dict\"))\n self.assertEqual(self.read_json_file(output_path), expected_data)\n def test_case_4(self):\n # Test DataFrame with mixed data types including column \"c\"\n data = {\n \"id\": [1, 2],\n \"is_student\": [True, False],\n \"grades\": [\"A\", \"B\"],\n \"c\": [0.5, 0.8],\n }\n df = pd.DataFrame(data)\n output_path = f_329(data)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(\n df.drop(columns=\"c\").to_dict(orient=\"dict\")\n )\n self.assertEqual(self.read_json_file(output_path), expected_data)\n def test_case_5(self):\n # Test an empty DataFrame\n data = {}\n df = pd.DataFrame(data)\n custom_path = \"./custom_data_output_5.json\"\n output_path = f_329(data, custom_path)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(df.to_dict(orient=\"dict\"))\n self.assertEqual(self.read_json_file(output_path), expected_data)", "apis": ["pandas.DataFrame", "json.dump"], "libs": ["pandas", "json"], "doc": {"description": ["Converts the given DataFrame to a dictionary, dropping the column named 'c'", "if it exists, and then saves it as a JSON file."], "note": [], "params": ["data (dict): The 
input data dictionary.", "output_path (str, optional): The path where the JSON file should be saved. Default is './default_data_output.json'."], "returns": ["str: Path where the JSON file was saved."], "reqs": ["pandas", "json"], "raises": [], "example": [">>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]})", "'./default_data_output.json'", ">>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]}, 'custom/path/results.json')", "'custom/path/results.json'"]}} -{"task_id": "f_380", "prompt": "import pandas as pd\nimport random\nimport re\n\n\ndef f_380(data_list, seed=None):\n \"\"\"\n Apply a random operation (remove, replace, shuffle, or randomize) to substrings in a list of strings.\n\n This function processes a list of comma-separated strings by applying one of four random operations to\n their substrings: remove, replace, shuffle, or randomize. Here, a substring refers to the individual\n items in the string that are separated by commas, sensitive to leading/trailing whitespace, i.e.\n 'apple' != 'apple ', and sensitive to case, i.e. 'APPLE' != 'aPPLE'.\n\n The choice of operation and the substrings it affects are determined randomly. The operations are:\n - Remove: Randomly selects and removes a substring.\n If a string contains only one substring, no 'remove' operation is applied.\n - Replace: Randomly selects a substring and replaces it with 'random_string'.\n - Shuffle: Randomly shuffles the order of the substrings.\n - Randomize: Assigns a new, random order to the substrings.\n\n Finally, the function returns a DataFrame with column 'Original String' containing the input strings\n and the 'Modified String' column containing the strings after applying the random operation.\n\n Parameters:\n - data_list (list): The list of strings. If empty, function will return a DataFrame with the expected\n columns that is otherwise empty.\n - seed (int, optional): A seed for the random operations to ensure reproducibility. 
Default is None.\n\n Returns:\n df (pd.DataFrame): DataFrame containing original and modified strings.\n\n Requirements:\n - pandas\n - random\n - re\n\n Example:\n >>> f_380(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=0)\n Original String Modified String\n 0 lamp, bag, mirror bag, lamp, mirror\n 1 table, chair, bag, lamp lamp, chair, bag, table\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n modified_strings = []\n for s in data_list:\n substrings = re.split(\", \", s)\n operation = random.choice([\"remove\", \"replace\", \"shuffle\", \"randomize\"])\n if operation == \"remove\":\n if len(substrings) > 1:\n random_substring = random.choice(substrings)\n substrings.remove(random_substring)\n modified_s = \", \".join(substrings)\n else:\n modified_s = s\n elif operation == \"replace\":\n random_substring_index = random.choice(range(len(substrings)))\n substrings[random_substring_index] = \"random_string\"\n modified_s = \", \".join(substrings)\n elif operation == \"shuffle\":\n random.shuffle(substrings)\n modified_s = \", \".join(substrings)\n elif operation == \"randomize\":\n random_positions = random.sample(range(len(substrings)), len(substrings))\n modified_s = \", \".join([substrings[i] for i in random_positions])\n modified_strings.append(modified_s)\n\n df[\"Modified String\"] = modified_strings\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n default_seed = 42\n def test_case_1(self):\n # Test basic functionality\n data_list = [\"lamp, bag, mirror\", \"table, chair, bag, lamp\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n self.assertNotEqual(result[\"Original String\"][0], result[\"Modified String\"][0])\n self.assertNotEqual(result[\"Original String\"][1], result[\"Modified String\"][1])\n def test_case_2(self):\n # Test single string\n data_list 
= [\"apple, orange, banana\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n self.assertNotEqual(result[\"Original String\"][0], result[\"Modified String\"][0])\n def test_case_3(self):\n # Test single character\n data_list = [\"a, b, c\", \"d, e, f\", \"g, h, i\", \"j, k, l\", \"m, n, o\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n for idx in range(len(data_list)):\n self.assertNotEqual(\n result[\"Original String\"][idx], result[\"Modified String\"][idx]\n )\n def test_case_4(self):\n # Test whitespace sensitivity\n data_list = [\"apple, apple, apple \", \" apple, apple , apple \"]\n result = f_380(data_list, seed=self.default_seed)\n modified_strings = result[\"Modified String\"].tolist()\n self.assertTrue(\n all(\n original != modified\n for original, modified in zip(data_list, modified_strings)\n ),\n \"The function should treat substrings differently based on whitespace.\",\n )\n def test_case_5(self):\n # Test case sensitivity\n data_list = [\"apple, Apple\", \"APPLE, apple\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n # Checking that modifications respect case sensitivity\n self.assertNotEqual(result[\"Modified String\"][0], result[\"Modified String\"][1])\n def test_case_6(self):\n # Test same random seed produces same results\n data_list = [\"lamp, bag, mirror\", \"table, chair, bag, lamp\"]\n result1 = f_380(data_list, seed=self.default_seed)\n result2 = f_380(data_list, seed=self.default_seed)\n pd.testing.assert_frame_equal(result1, result2)\n def test_case_7(self):\n # Test function integrity by calculating expected results with fixed random seed\n data_list = [\"a, b, c\", \"d, e, f\"]\n expected_modifications = [\"b, c\", \"e, f, d\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(\n 
result[\"Modified String\"].tolist(),\n expected_modifications,\n \"With a fixed seed, the modifications should be predictable and reproducible.\",\n )\n def test_case_8(self):\n # Test invalid input handling\n for invalid_data_list in [\n [1, 2, 3],\n [None, \"apple\"],\n [None, None],\n [1, \"orange\", 3],\n ]:\n with self.assertRaises(TypeError):\n f_380(invalid_data_list, seed=self.default_seed)\n def test_case_9(self):\n # Test empty list input\n data_list = []\n result = f_380(data_list, seed=self.default_seed)\n self.assertTrue(\n result.empty,\n \"The result should be an empty DataFrame for an empty input list.\",\n )\n def test_case_10(self):\n # Test input list with an empty string\n data_list = [\"\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(\n result[\"Modified String\"].tolist(),\n [\"\"],\n \"An empty string should remain unchanged.\",\n )\n def test_case_11(self):\n # Test input with a single substring (no commas)\n data_list = [\"single\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(\n result[\"Modified String\"].tolist(),\n [\"single\"],\n \"A single substring should remain unchanged.\",\n )", "apis": ["random.shuffle", "random.seed", "random.choice", "random.sample", "re.split", "pandas.DataFrame"], "libs": ["pandas", "re", "random"], "doc": {"description": ["Apply a random operation (remove, replace, shuffle, or randomize) to substrings in a list of strings.", "This function processes a list of comma-separated strings by applying one of four random operations to", "their substrings: remove, replace, shuffle, or randomize. Here, a substring refers to the individual", "items in the string that are separated by commas, sensitive to leading/trailing whitespace, i.e.", "'apple' != 'apple ', and sensitive to case, i.e. 'APPLE' != 'aPPLE'.", "The choice of operation and the substrings it affects are determined randomly. 
The operations are:", "- Remove: Randomly selects and removes a substring.", "If a string contains only one substring, no 'remove' operation is applied.", "- Replace: Randomly selects a substring and replaces it with 'random_string'.", "- Shuffle: Randomly shuffles the order of the substrings.", "- Randomize: Assigns a new, random order to the substrings.", "Finally, the function returns a DataFrame with column 'Original String' containing the input strings", "and the 'Modified String' column containing the strings after applying the random operation."], "note": [], "params": ["data_list (list): The list of strings. If empty, function will return a DataFrame with the expected", "columns that is otherwise empty.", "seed (int, optional): A seed for the random operations to ensure reproducibility. Default is None."], "returns": ["df (pd.DataFrame): DataFrame containing original and modified strings."], "reqs": ["pandas", "random", "re"], "raises": [], "example": [">>> f_380(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=0)", "Original String Modified String", "0 lamp, bag, mirror bag, lamp, mirror", "1 table, chair, bag, lamp lamp, chair, bag, table"]}} -{"task_id": "f_371", "prompt": "import matplotlib.pyplot as plt\nfrom sklearn.cluster import KMeans\n\n\ndef f_371(myList, n_clusters):\n \"\"\"\n Cluster a list of 2D points using KMeans and visualize the clusters.\n\n Note: This function raises ValueError if it encounters invalid inputs.\n KMeans is performed with random_state = 42 and n_init = 10. 
Scatterplot\n uses red 'x' markers for cluster centers.\n\n Parameters:\n - myList (list): List of 2D points.\n - n_clusters (int): Number of clusters to form.\n\n Returns:\n - matplotlib.axes._axes.Axes: Axes object with the plotted clusters.\n\n Requirements:\n - matplotlib.pyplot\n - sklearn.cluster.KMeans\n\n Example:\n >>> myList = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]\n >>> ax = f_371(myList, 2)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7'), Text(8.0, 0, '8'), Text(9.0, 0, '9'), Text(10.0, 0, '10')]\n \"\"\"", "canonical_solution": " if not myList or n_clusters <= 0:\n raise ValueError(\"Invalid inputs\")\n\n kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)\n kmeans.fit(myList)\n\n fig, ax = plt.subplots()\n ax.scatter(*zip(*myList), c=kmeans.labels_)\n ax.scatter(*zip(*kmeans.cluster_centers_), marker=\"x\", color=\"red\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.test_list = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]\n def test_case_1(self):\n # Test single cluster\n myList = [[1, 1], [1, 1], [1, 1], [1, 1]]\n ax = f_371(myList, 1)\n self.assertEqual(len(set(ax.collections[0].get_array())), 1)\n def test_case_2(self):\n # Test arbitrary number of clusters\n myList = self.test_list\n for n in range(1, 6):\n ax = f_371(myList, n)\n self.assertEqual(len(set(ax.collections[0].get_array())), n)\n def test_case_3(self):\n # Test visualization\n myList = self.test_list\n ax = f_371(myList, 2)\n red_collection = next(\n coll\n for coll in ax.collections\n if (\n coll.get_facecolor()[0][0] == 1.0\n and coll.get_facecolor()[0][1] == 0.0\n and coll.get_facecolor()[0][2] == 0.0\n )\n )\n red_x_markers_count = len(red_collection.get_offsets())\n self.assertEqual(red_x_markers_count, 2)\n 
def test_case_4(self):\n # Test handling invalid inputs\n with self.assertRaises(ValueError):\n f_371([], 1)\n with self.assertRaises(ValueError):\n f_371([[1, 1], [2, 2]], 0)\n with self.assertRaises(ValueError):\n f_371(self.test_list, len(self.test_list) + 1)\n def test_case_5(self):\n # Test consistency across runs with built-in random seed\n myList = self.test_list\n ax1 = f_371(myList, 2)\n ax2 = f_371(myList, 2)\n colors1 = ax1.collections[0].get_array()\n colors2 = ax2.collections[0].get_array()\n self.assertTrue(all(c1 == c2 for c1, c2 in zip(colors1, colors2)))\n def tearDown(self):\n plt.close(\"all\")", "apis": ["sklearn.cluster.KMeans", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "sklearn"], "doc": {"description": ["Cluster a list of 2D points using KMeans and visualize the clusters."], "note": ["This function raises ValueError if it encounters invalid inputs.", "KMeans is performed with random_state = 42 and n_init = 10. Scatterplot", "uses red 'x' markers for cluster centers."], "params": ["myList (list): List of 2D points.", "n_clusters (int): Number of clusters to form."], "returns": ["matplotlib.axes._axes.Axes: Axes object with the plotted clusters."], "reqs": ["matplotlib.pyplot", "sklearn.cluster.KMeans"], "raises": [], "example": [">>> myList = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]", ">>> ax = f_371(myList, 2)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7'), Text(8.0, 0, '8'), Text(9.0, 0, '9'), Text(10.0, 0, '10')]"]}} +{"task_id": "f_329", "prompt": "import pandas as pd\nimport json\n\n\ndef f_329(data: dict, output_path: str = \"./default_data_output.json\") -> str:\n \"\"\"Converts the given DataFrame to a dictionary, dropping the column named 'c'\n if it exists, and then saves it as a JSON file.\n\n Parameters:\n - data (dict): The input data dictionary.\n - output_path 
(str, optional): The path where the JSON file should be saved. Default is './default_data_output.json'.\n\n Returns:\n - str: Path where the JSON file was saved.\n\n Requirements:\n - pandas\n - json\n\n Example:\n >>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]})\n './default_data_output.json'\n >>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]}, 'custom/path/results.json')\n 'custom/path/results.json'\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n # Drop column named 'c' if it exists\n df = df.drop(columns=\"c\", errors=\"ignore\")\n # Convert the DataFrame to dictionary\n data_dict = df.to_dict(orient=\"dict\")\n # Save the dictionary as a JSON file\n with open(output_path, \"w\") as file:\n json.dump(data_dict, file)\n\n return output_path", "test": "import unittest\nimport pandas as pd\nimport json\nimport os\nclass TestCases(unittest.TestCase):\n def read_json_file(self, path):\n # Helper function to read content from a JSON file\n with open(path, \"r\") as f:\n return json.load(f)\n def tearDown(self):\n # Cleanup procedure after each test to remove generated files\n files_to_remove = [\n \"./default_data_output.json\",\n \"./custom_data_output_2.json\",\n \"./custom_data_output_3.json\",\n \"./custom_data_output_4.json\",\n \"./custom_data_output_5.json\",\n ]\n for file in files_to_remove:\n if os.path.exists(file):\n os.remove(file)\n def convert_keys_to_str(self, dictionary):\n # Convert dictionary keys to strings recursively\n if not isinstance(dictionary, dict):\n return dictionary\n return {str(k): self.convert_keys_to_str(v) for k, v in dictionary.items()}\n def test_case_1(self):\n # Test basic DataFrame with column \"c\"\n data = {\"a\": [1, 2], \"b\": [3, 4], \"c\": [5, 6]}\n df = pd.DataFrame(data)\n output_path = f_329(data)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(\n df.drop(columns=\"c\").to_dict(orient=\"dict\")\n )\n self.assertEqual(self.read_json_file(output_path), expected_data)\n 
def test_case_2(self):\n # Test DataFrame with non-numeric data and column \"c\"\n data = {\"name\": [\"Alice\", \"Bob\"], \"country\": [\"USA\", \"Canada\"], \"c\": [\"x\", \"y\"]}\n df = pd.DataFrame(data)\n custom_path = \"./custom_data_output_2.json\"\n output_path = f_329(data, custom_path)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(\n df.drop(columns=\"c\").to_dict(orient=\"dict\")\n )\n self.assertEqual(self.read_json_file(output_path), expected_data)\n def test_case_3(self):\n # Test DataFrame with multiple columns and no column \"c\"\n data = {\"age\": [25, 30], \"height\": [170, 175]}\n df = pd.DataFrame(data)\n custom_path = \"./custom_data_output_3.json\"\n output_path = f_329(data, custom_path)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(df.to_dict(orient=\"dict\"))\n self.assertEqual(self.read_json_file(output_path), expected_data)\n def test_case_4(self):\n # Test DataFrame with mixed data types including column \"c\"\n data = {\n \"id\": [1, 2],\n \"is_student\": [True, False],\n \"grades\": [\"A\", \"B\"],\n \"c\": [0.5, 0.8],\n }\n df = pd.DataFrame(data)\n output_path = f_329(data)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(\n df.drop(columns=\"c\").to_dict(orient=\"dict\")\n )\n self.assertEqual(self.read_json_file(output_path), expected_data)\n def test_case_5(self):\n # Test an empty DataFrame\n data = {}\n df = pd.DataFrame(data)\n custom_path = \"./custom_data_output_5.json\"\n output_path = f_329(data, custom_path)\n self.assertTrue(os.path.exists(output_path))\n expected_data = self.convert_keys_to_str(df.to_dict(orient=\"dict\"))\n self.assertEqual(self.read_json_file(output_path), expected_data)", "apis": ["pandas.DataFrame", "json.dump"], "libs": ["json", "pandas"], "doc": {"description": ["Converts the given DataFrame to a dictionary, dropping the column named 'c'", "if it exists, and then saves it 
as a JSON file."], "note": [], "params": ["data (dict): The input data dictionary.", "output_path (str, optional): The path where the JSON file should be saved. Default is './default_data_output.json'."], "returns": ["str: Path where the JSON file was saved."], "reqs": ["pandas", "json"], "raises": [], "example": [">>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]})", "'./default_data_output.json'", ">>> f_329({'a': [1,2], 'b': [3,4], 'c': [5,6]}, 'custom/path/results.json')", "'custom/path/results.json'"]}} +{"task_id": "f_380", "prompt": "import pandas as pd\nimport random\nimport re\n\n\ndef f_380(data_list, seed=None):\n \"\"\"\n Apply a random operation (remove, replace, shuffle, or randomize) to substrings in a list of strings.\n\n This function processes a list of comma-separated strings by applying one of four random operations to\n their substrings: remove, replace, shuffle, or randomize. Here, a substring refers to the individual\n items in the string that are separated by commas, sensitive to leading/trailing whitespace, i.e.\n 'apple' != 'apple ', and sensitive to case, i.e. 'APPLE' != 'aPPLE'.\n\n The choice of operation and the substrings it affects are determined randomly. The operations are:\n - Remove: Randomly selects and removes a substring.\n If a string contains only one substring, no 'remove' operation is applied.\n - Replace: Randomly selects a substring and replaces it with 'random_string'.\n - Shuffle: Randomly shuffles the order of the substrings.\n - Randomize: Assigns a new, random order to the substrings.\n\n Finally, the function returns a DataFrame with column 'Original String' containing the input strings\n and the 'Modified String' column containing the strings after applying the random operation.\n\n Parameters:\n - data_list (list): The list of strings. If empty, function will return a DataFrame with the expected\n columns that is otherwise empty.\n - seed (int, optional): A seed for the random operations to ensure reproducibility. 
Default is None.\n\n Returns:\n df (pd.DataFrame): DataFrame containing original and modified strings.\n\n Requirements:\n - pandas\n - random\n - re\n\n Example:\n >>> f_380(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=0)\n Original String Modified String\n 0 lamp, bag, mirror bag, lamp, mirror\n 1 table, chair, bag, lamp lamp, chair, bag, table\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n modified_strings = []\n for s in data_list:\n substrings = re.split(\", \", s)\n operation = random.choice([\"remove\", \"replace\", \"shuffle\", \"randomize\"])\n if operation == \"remove\":\n if len(substrings) > 1:\n random_substring = random.choice(substrings)\n substrings.remove(random_substring)\n modified_s = \", \".join(substrings)\n else:\n modified_s = s\n elif operation == \"replace\":\n random_substring_index = random.choice(range(len(substrings)))\n substrings[random_substring_index] = \"random_string\"\n modified_s = \", \".join(substrings)\n elif operation == \"shuffle\":\n random.shuffle(substrings)\n modified_s = \", \".join(substrings)\n elif operation == \"randomize\":\n random_positions = random.sample(range(len(substrings)), len(substrings))\n modified_s = \", \".join([substrings[i] for i in random_positions])\n modified_strings.append(modified_s)\n\n df[\"Modified String\"] = modified_strings\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n default_seed = 42\n def test_case_1(self):\n # Test basic functionality\n data_list = [\"lamp, bag, mirror\", \"table, chair, bag, lamp\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n self.assertNotEqual(result[\"Original String\"][0], result[\"Modified String\"][0])\n self.assertNotEqual(result[\"Original String\"][1], result[\"Modified String\"][1])\n def test_case_2(self):\n # Test single string\n data_list 
= [\"apple, orange, banana\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n self.assertNotEqual(result[\"Original String\"][0], result[\"Modified String\"][0])\n def test_case_3(self):\n # Test single character\n data_list = [\"a, b, c\", \"d, e, f\", \"g, h, i\", \"j, k, l\", \"m, n, o\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n for idx in range(len(data_list)):\n self.assertNotEqual(\n result[\"Original String\"][idx], result[\"Modified String\"][idx]\n )\n def test_case_4(self):\n # Test whitespace sensitivity\n data_list = [\"apple, apple, apple \", \" apple, apple , apple \"]\n result = f_380(data_list, seed=self.default_seed)\n modified_strings = result[\"Modified String\"].tolist()\n self.assertTrue(\n all(\n original != modified\n for original, modified in zip(data_list, modified_strings)\n ),\n \"The function should treat substrings differently based on whitespace.\",\n )\n def test_case_5(self):\n # Test case sensitivity\n data_list = [\"apple, Apple\", \"APPLE, apple\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(result[\"Original String\"].tolist(), data_list)\n # Checking that modifications respect case sensitivity\n self.assertNotEqual(result[\"Modified String\"][0], result[\"Modified String\"][1])\n def test_case_6(self):\n # Test same random seed produces same results\n data_list = [\"lamp, bag, mirror\", \"table, chair, bag, lamp\"]\n result1 = f_380(data_list, seed=self.default_seed)\n result2 = f_380(data_list, seed=self.default_seed)\n pd.testing.assert_frame_equal(result1, result2)\n def test_case_7(self):\n # Test function integrity by calculating expected results with fixed random seed\n data_list = [\"a, b, c\", \"d, e, f\"]\n expected_modifications = [\"b, c\", \"e, f, d\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(\n 
result[\"Modified String\"].tolist(),\n expected_modifications,\n \"With a fixed seed, the modifications should be predictable and reproducible.\",\n )\n def test_case_8(self):\n # Test invalid input handling\n for invalid_data_list in [\n [1, 2, 3],\n [None, \"apple\"],\n [None, None],\n [1, \"orange\", 3],\n ]:\n with self.assertRaises(TypeError):\n f_380(invalid_data_list, seed=self.default_seed)\n def test_case_9(self):\n # Test empty list input\n data_list = []\n result = f_380(data_list, seed=self.default_seed)\n self.assertTrue(\n result.empty,\n \"The result should be an empty DataFrame for an empty input list.\",\n )\n def test_case_10(self):\n # Test input list with an empty string\n data_list = [\"\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(\n result[\"Modified String\"].tolist(),\n [\"\"],\n \"An empty string should remain unchanged.\",\n )\n def test_case_11(self):\n # Test input with a single substring (no commas)\n data_list = [\"single\"]\n result = f_380(data_list, seed=self.default_seed)\n self.assertEqual(\n result[\"Modified String\"].tolist(),\n [\"single\"],\n \"A single substring should remain unchanged.\",\n )", "apis": ["pandas.DataFrame", "re.split", "random.shuffle", "random.seed", "random.choice", "random.sample"], "libs": ["random", "re", "pandas"], "doc": {"description": ["Apply a random operation (remove, replace, shuffle, or randomize) to substrings in a list of strings.", "This function processes a list of comma-separated strings by applying one of four random operations to", "their substrings: remove, replace, shuffle, or randomize. Here, a substring refers to the individual", "items in the string that are separated by commas, sensitive to leading/trailing whitespace, i.e.", "'apple' != 'apple ', and sensitive to case, i.e. 'APPLE' != 'aPPLE'.", "The choice of operation and the substrings it affects are determined randomly. 
The operations are:", "- Remove: Randomly selects and removes a substring.", "If a string contains only one substring, no 'remove' operation is applied.", "- Replace: Randomly selects a substring and replaces it with 'random_string'.", "- Shuffle: Randomly shuffles the order of the substrings.", "- Randomize: Assigns a new, random order to the substrings.", "Finally, the function returns a DataFrame with column 'Original String' containing the input strings", "and the 'Modified String' column containing the strings after applying the random operation."], "note": [], "params": ["data_list (list): The list of strings. If empty, function will return a DataFrame with the expected", "columns that is otherwise empty.", "seed (int, optional): A seed for the random operations to ensure reproducibility. Default is None."], "returns": ["df (pd.DataFrame): DataFrame containing original and modified strings."], "reqs": ["pandas", "random", "re"], "raises": [], "example": [">>> f_380(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=0)", "Original String Modified String", "0 lamp, bag, mirror bag, lamp, mirror", "1 table, chair, bag, lamp lamp, chair, bag, table"]}} +{"task_id": "f_371", "prompt": "import matplotlib.pyplot as plt\nfrom sklearn.cluster import KMeans\n\n\ndef f_371(myList, n_clusters):\n \"\"\"\n Cluster a list of 2D points using KMeans and visualize the clusters.\n\n Note: This function raises ValueError if it encounters invalid inputs.\n KMeans is performed with random_state = 42 and n_init = 10. 
Scatterplot\n uses red 'x' markers for cluster centers.\n\n Parameters:\n - myList (list): List of 2D points.\n - n_clusters (int): Number of clusters to form.\n\n Returns:\n - matplotlib.axes._axes.Axes: Axes object with the plotted clusters.\n\n Requirements:\n - matplotlib.pyplot\n - sklearn.cluster.KMeans\n\n Example:\n >>> myList = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]\n >>> ax = f_371(myList, 2)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7'), Text(8.0, 0, '8'), Text(9.0, 0, '9'), Text(10.0, 0, '10')]\n \"\"\"", "canonical_solution": " if not myList or n_clusters <= 0:\n raise ValueError(\"Invalid inputs\")\n\n kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)\n kmeans.fit(myList)\n\n fig, ax = plt.subplots()\n ax.scatter(*zip(*myList), c=kmeans.labels_)\n ax.scatter(*zip(*kmeans.cluster_centers_), marker=\"x\", color=\"red\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.test_list = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]\n def test_case_1(self):\n # Test single cluster\n myList = [[1, 1], [1, 1], [1, 1], [1, 1]]\n ax = f_371(myList, 1)\n self.assertEqual(len(set(ax.collections[0].get_array())), 1)\n def test_case_2(self):\n # Test arbitrary number of clusters\n myList = self.test_list\n for n in range(1, 6):\n ax = f_371(myList, n)\n self.assertEqual(len(set(ax.collections[0].get_array())), n)\n def test_case_3(self):\n # Test visualization\n myList = self.test_list\n ax = f_371(myList, 2)\n red_collection = next(\n coll\n for coll in ax.collections\n if (\n coll.get_facecolor()[0][0] == 1.0\n and coll.get_facecolor()[0][1] == 0.0\n and coll.get_facecolor()[0][2] == 0.0\n )\n )\n red_x_markers_count = len(red_collection.get_offsets())\n self.assertEqual(red_x_markers_count, 2)\n 
def test_case_4(self):\n # Test handling invalid inputs\n with self.assertRaises(ValueError):\n f_371([], 1)\n with self.assertRaises(ValueError):\n f_371([[1, 1], [2, 2]], 0)\n with self.assertRaises(ValueError):\n f_371(self.test_list, len(self.test_list) + 1)\n def test_case_5(self):\n # Test consistency across runs with built-in random seed\n myList = self.test_list\n ax1 = f_371(myList, 2)\n ax2 = f_371(myList, 2)\n colors1 = ax1.collections[0].get_array()\n colors2 = ax2.collections[0].get_array()\n self.assertTrue(all(c1 == c2 for c1, c2 in zip(colors1, colors2)))\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "sklearn.cluster.KMeans"], "libs": ["sklearn", "matplotlib"], "doc": {"description": ["Cluster a list of 2D points using KMeans and visualize the clusters."], "note": ["This function raises ValueError if it encounters invalid inputs.", "KMeans is performed with random_state = 42 and n_init = 10. Scatterplot", "uses red 'x' markers for cluster centers."], "params": ["myList (list): List of 2D points.", "n_clusters (int): Number of clusters to form."], "returns": ["matplotlib.axes._axes.Axes: Axes object with the plotted clusters."], "reqs": ["matplotlib.pyplot", "sklearn.cluster.KMeans"], "raises": [], "example": [">>> myList = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]", ">>> ax = f_371(myList, 2)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7'), Text(8.0, 0, '8'), Text(9.0, 0, '9'), Text(10.0, 0, '10')]"]}} {"task_id": "f_776", "prompt": "from collections import Counter\n\ndef f_776(word: str) -> dict:\n \"\"\"\n Count the occurrence of each adjacent pair of letters in a word.\n\n Functionality:\n This function counts the occurrences of each adjacent pair of letters in a given word.\n\n Parameters:\n - word (str): The word in which to count the adjacent letter 
pairs.\n\n Returns:\n - dict: A dictionary where keys are adjacent letter pairs and values are their counts.\n\n Required Libraries:\n - collections.Counter\n\n Examples:\n >>> f_776('abracadabra')\n {'ab': 2, 'br': 2, 'ra': 2, 'ac': 1, 'ca': 1, 'ad': 1, 'da': 1}\n >>> f_776('hello')\n {'he': 1, 'el': 1, 'll': 1, 'lo': 1}\n \"\"\"", "canonical_solution": " pairs = list(map(''.join, zip(word[:-1], word[1:])))\n pairs_count = dict(Counter(pairs))\n return pairs_count", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with the word 'abracadabra'\n result = f_776('abracadabra')\n expected = {'ab': 2, 'br': 2, 'ra': 2, 'ac': 1, 'ca': 1, 'ad': 1, 'da': 1} # Corrected this line\n self.assertEqual(result, expected)\n def test_case_2(self):\n # Test with the word 'hello'\n result = f_776('hello')\n expected = {'he': 1, 'el': 1, 'll': 1, 'lo': 1}\n self.assertEqual(result, expected)\n def test_case_3(self):\n # Test with the word 'python'\n result = f_776('python')\n expected = {'py': 1, 'yt': 1, 'th': 1, 'ho': 1, 'on': 1}\n self.assertEqual(result, expected)\n def test_case_4(self):\n # Test with an empty string\n result = f_776('')\n expected = {}\n self.assertEqual(result, expected)\n def test_case_5(self):\n # Test with a single character string\n result = f_776('a')\n expected = {}\n self.assertEqual(result, expected)", "apis": ["collections.Counter"], "libs": ["collections"], "doc": {"description": ["Count the occurrence of each adjacent pair of letters in a word.", "Functionality:", "This function counts the occurrences of each adjacent pair of letters in a given word.", "Required Libraries:", "- collections.Counter"], "note": [], "params": ["word (str): The word in which to count the adjacent letter pairs."], "returns": ["dict: A dictionary where keys are adjacent letter pairs and values are their counts."], "reqs": [], "raises": [], "example": ["Examples:", ">>> f_776('abracadabra')", "{'ab': 2, 'br': 2, 'ra': 2, 'ac': 
1, 'ca': 1, 'ad': 1, 'da': 1}", ">>> f_776('hello')", "{'he': 1, 'el': 1, 'll': 1, 'lo': 1}"]}} -{"task_id": "f_611", "prompt": "import json\nimport csv\n\ndef f_611(json_file, csv_file):\n \"\"\"\n Convert a JSON file to CSV.\n \n Parameters:\n - json_file (str): The path to the JSON file.\n - csv_file (str): The path to the CSV file.\n\n Returns:\n - csv_file: The function returns the path to the CSV file that was written.\n\n Requirements:\n - json\n - csv\n \n Example:\n >>> f_611('path_to_json_file.json', 'path_to_csv_file.csv')\n 'path_to_csv_file.csv'\n \"\"\"", "canonical_solution": " with open(json_file, 'r') as f:\n data = json.load(f)\n\n with open(csv_file, 'w') as f:\n writer = csv.writer(f)\n writer.writerow(data.keys())\n writer.writerow(data.values())\n \n return csv_file", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Create json file\n json_file = './test.json'\n with open(json_file, 'w') as f:\n json.dump({'a': 1, 'b': 2, 'c': 3}, f)\n # Run function\n csv_file = f_611(json_file, './test.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['a', 'b', 'c'], ['1', '2', '3']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_2(self):\n # Create json file\n json_file = './test.json'\n with open(json_file, 'w') as f:\n json.dump({'z': 1, 'y': 2, 'x': 3}, f)\n # Run function\n csv_file = f_611(json_file, './test.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['z', 'y', 'x'], ['1', '2', '3']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_3(self):\n # Create json file\n json_file = './testx.json'\n with open(json_file, 'w') as f:\n json.dump({'xxx': 99}, f)\n # Run function\n csv_file = 
f_611(json_file, './testx.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['xxx'], ['99']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_4(self):\n # Create json file\n json_file = './testy.json'\n with open(json_file, 'w') as f:\n json.dump({'yyy': 99}, f)\n # Run function\n csv_file = f_611(json_file, './testy.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['yyy'], ['99']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_5(self):\n # Create json file\n json_file = './testz.json'\n with open(json_file, 'w') as f:\n json.dump({'zzz': 99}, f)\n # Run function\n csv_file = f_611(json_file, './testz.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['zzz'], ['99']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)", "apis": ["json.load", "csv.writer"], "libs": ["csv", "json"], "doc": {"description": ["Convert a JSON file to CSV."], "note": [], "params": ["json_file (str): The path to the JSON file.", "csv_file (str): The path to the CSV file."], "returns": ["csv_file: The function returns the path to the CSV file that was written."], "reqs": ["json", "csv"], "raises": [], "example": [">>> f_611('path_to_json_file.json', 'path_to_csv_file.csv')", "'path_to_csv_file.csv'"]}} -{"task_id": "f_342", "prompt": "import pickle\nimport os\nimport pandas as pd\nimport numpy as np\n\n\ndef f_342(df, file_name=\"save.pkl\"):\n \"\"\"\n Save the provided Pandas DataFrame \"df\" in a pickle file with the given name, read it\n back for validation, and delete the intermediate file.\n\n Parameters:\n df (DataFrame): The pandas 
DataFrame to be saved.\n file_name (str, optional): Name of the file where the DataFrame will be saved. Defaults to 'save.pkl'.\n\n Returns:\n loaded_df (pd.DataFrame): The loaded DataFrame from the specified file.\n\n Requirements:\n - pickle\n - os\n\n Example:\n >>> np.random.seed(0)\n >>> df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n >>> loaded_df = f_342(df, 'test_file.pkl')\n >>> assert df.equals(loaded_df)\n >>> type(df), type(loaded_df)\n (, )\n >>> df.head(2)\n A B C D\n 0 44 47 64 67\n 1 67 9 83 21\n \"\"\"", "canonical_solution": " with open(file_name, \"wb\") as file:\n pickle.dump(df, file)\n\n with open(file_name, \"rb\") as file:\n loaded_df = pickle.load(file)\n\n os.remove(file_name)\n\n return loaded_df", "test": "import unittest\nimport os\nimport pandas as pd\nimport numpy as np\nimport tempfile\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n def tearDown(self):\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test with random integers\n df = pd.DataFrame(\n np.random.randint(0, 100, size=(100, 4)), columns=list(\"ABCD\")\n )\n file_path = os.path.join(self.temp_dir.name, \"test.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_2(self):\n # Test with floats\n df = pd.DataFrame(np.random.rand(50, 3), columns=list(\"XYZ\"))\n file_path = os.path.join(self.temp_dir.name, \"floats.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_3(self):\n # Test with strings\n df = pd.DataFrame({\"A\": [\"foo\", \"bar\", \"baz\"], \"B\": [\"qux\", \"quux\", \"corge\"]})\n file_path = os.path.join(self.temp_dir.name, \"strings.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n 
self.assertFalse(os.path.exists(file_path))\n def test_case_4(self):\n # Test with empty dataframe\n df = pd.DataFrame()\n file_path = os.path.join(self.temp_dir.name, \"empty.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_5(self):\n # Test with datetime\n df = pd.DataFrame(\n {\"Date\": [datetime(2020, 1, 1), datetime(2020, 1, 2)], \"Value\": [10, 20]}\n )\n file_path = os.path.join(self.temp_dir.name, \"datetime.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_6(self):\n # Test larger dataframe\n df = pd.DataFrame(\n np.random.randint(0, 100, size=(10000, 10)),\n columns=[f\"Col{i}\" for i in range(10)],\n )\n file_path = os.path.join(self.temp_dir.name, \"large.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_7(self):\n # Test single entry dataframe\n df = pd.DataFrame({\"Single\": [42]})\n file_path = os.path.join(self.temp_dir.name, \"test_file_small.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(\n df.equals(loaded_df), \"Loaded DataFrame does not match the original.\"\n )\n self.assertFalse(os.path.exists(file_path))", "apis": ["pickle.dump", "os.remove", "pickle.load"], "libs": ["pickle", "os"], "doc": {"description": ["Save the provided Pandas DataFrame \"df\" in a pickle file with the given name, read it", "back for validation, and delete the intermediate file."], "note": [], "params": ["df (DataFrame): The pandas DataFrame to be saved.", "file_name (str, optional): Name of the file where the DataFrame will be saved. 
Defaults to 'save.pkl'."], "returns": ["loaded_df (pd.DataFrame): The loaded DataFrame from the specified file."], "reqs": ["pickle", "os"], "raises": [], "example": [">>> np.random.seed(0)", ">>> df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))", ">>> loaded_df = f_342(df, 'test_file.pkl')", ">>> assert df.equals(loaded_df)", ">>> type(df), type(loaded_df)", "(, )", ">>> df.head(2)", "A B C D", "0 44 47 64 67", "1 67 9 83 21"]}} -{"task_id": "f_898", "prompt": "import csv\nimport numpy as np\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\n\n\ndef f_898(file_path):\n \"\"\"\n This function processes a CSV file containing numeric data representing a population. It randomly\n selects 30 individuals from this population without replacement to form a sample. The function\n calculates the mean and standard deviation of this sample. The means delta degree is 1. It also generates a histogram of the\n sample data and overlays a normal distribution curve on this histogram.\n\n Parameters:\n - file_path (str): A string representing the path to the CSV file. 
Each line in the file should contain\n a single numeric value representing an individual in the population.\n\n Returns:\n - Tuple (float, float, matplotlib.axes._subplots.AxesSubplot): The function returns a tuple containing\n three elements:\n - Sample mean (float): The mean of the sample.\n - Sample standard deviation (float): The standard deviation of the sample, calculated with a\n degrees of freedom (ddof) of 1.\n - Matplotlib subplot (matplotlib.axes._subplots.AxesSubplot): An object representing the\n generated histogram plot with the normal distribution curve.\n\n Requirements:\n - csv\n - numpy\n - scipy\n - matplotlib\n\n Notes:\n - The function uses numpy for random sampling and statistical calculations.\n - The matplotlib library is used to plot the histogram and the normal distribution curve.\n - The function includes exception handling for file input/output errors, ensuring that any issues\n with reading the CSV file are properly communicated.\n - The function plots a histogram of the sample using matplotlib, with the number of bins\n determined automatically ('auto').\n\n Example:\n >>> mean, std_dev, ax = f_898('population_data.csv')\n >>> print(mean, std_dev)\n (50.5, 29.011491975882016)\n\n In this example, 'population_data.csv' is a CSV file where each line contains a numeric value. The\n function reads this file, samples 30 values, computes their mean and standard deviation, and plots\n a histogram with a normal distribution curve.\n \"\"\"", "canonical_solution": " try:\n with open(file_path, \"r\", encoding=\"utf-8\") as file:\n reader = csv.reader(file)\n population = [int(row[0]) for row in reader]\n except IOError as exc:\n raise IOError(\n \"Error reading the file. 
Please check the file path and permissions.\"\n ) from exc\n\n sample = np.random.choice(population, 30, replace=False)\n mean = np.mean(sample)\n std_dev = np.std(sample, ddof=1)\n\n plt.hist(sample, bins=\"auto\", density=True, alpha=0.7, rwidth=0.85)\n xmin, xmax = plt.xlim()\n x = np.linspace(xmin, xmax, 100)\n p = stats.norm.pdf(x, mean, std_dev)\n plt.plot(x, p, \"k\", linewidth=2)\n plt.xlabel(\"Sample Values\")\n plt.ylabel(\"Frequency\")\n plt.title(\"Sample Histogram with Normal Distribution Overlay\")\n ax = plt.gca()\n\n return mean, std_dev, ax", "test": "import unittest\nfrom unittest.mock import patch, mock_open\nimport matplotlib\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_898.\"\"\"\n def setUp(self):\n \"\"\"Set up the test environment.\"\"\"\n matplotlib.use(\"Agg\")\n def test_valid_csv_file(self):\n \"\"\"Test with a valid CSV file.\"\"\"\n mock_data = \"1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n10\\n11\\n12\\n13\\n14\\n15\\n16\\n17\\n18\\n19\\n20\\n21\\n22\\n23\\n24\\n25\\n26\\n27\\n28\\n29\\n30\\n31\"\n with patch(\"builtins.open\", mock_open(read_data=mock_data)):\n mean, std_dev, ax = f_898(\"dummy_path\")\n self.assertIsNotNone(mean)\n self.assertIsNotNone(std_dev)\n def test_empty_csv_file(self):\n \"\"\"Test with an empty CSV file.\"\"\"\n mock_data = \"\"\n with patch(\"builtins.open\", mock_open(read_data=mock_data)), self.assertRaises(\n ValueError\n ):\n f_898(\"dummy_path\")\n def test_non_existent_file(self):\n \"\"\"Test with a non-existent file path.\"\"\"\n with self.assertRaises(IOError):\n f_898(\"non_existent_path.csv\")\n def test_csv_with_non_numeric_data(self):\n \"\"\"Test with a CSV file containing non-numeric data.\"\"\"\n mock_data = \"a\\nb\\nc\\nd\\ne\"\n with patch(\"builtins.open\", mock_open(read_data=mock_data)), self.assertRaises(\n ValueError\n ):\n f_898(\"dummy_path\")\n def test_small_population_size(self):\n \"\"\"Test with a small population size.\"\"\"\n mock_data = \"1\\n2\\n3\\n4\\n5\"\n 
with patch(\"builtins.open\", mock_open(read_data=mock_data)), self.assertRaises(\n ValueError\n ):\n f_898(\"dummy_path\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["scipy.stats.norm.pdf", "numpy.mean", "numpy.std", "numpy.random", "csv.reader", "scipy.stats.norm", "matplotlib.pyplot.plot", "matplotlib.pyplot.hist", "matplotlib.pyplot.xlabel", "matplotlib.pyplot.xlim", "matplotlib.pyplot.title", "numpy.random.choice", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "numpy.linspace"], "libs": ["numpy", "matplotlib", "csv", "scipy"], "doc": {"description": ["This function processes a CSV file containing numeric data representing a population. It randomly", "selects 30 individuals from this population without replacement to form a sample. The function", "calculates the mean and standard deviation of this sample. The means delta degree is 1. It also generates a histogram of the", "sample data and overlays a normal distribution curve on this histogram.", "Notes:", "- The function uses numpy for random sampling and statistical calculations.", "- The matplotlib library is used to plot the histogram and the normal distribution curve.", "- The function includes exception handling for file input/output errors, ensuring that any issues", "with reading the CSV file are properly communicated.", "- The function plots a histogram of the sample using matplotlib, with the number of bins", "determined automatically ('auto').", "In this example, 'population_data.csv' is a CSV file where each line contains a numeric value. The", "function reads this file, samples 30 values, computes their mean and standard deviation, and plots", "a histogram with a normal distribution curve."], "note": [], "params": ["file_path (str): A string representing the path to the CSV file. 
Each line in the file should contain", "a single numeric value representing an individual in the population."], "returns": ["Tuple (float, float, matplotlib.axes._subplots.AxesSubplot): The function returns a tuple containing", "three elements:", "Sample mean (float): The mean of the sample.", "Sample standard deviation (float): The standard deviation of the sample, calculated with a", "degrees of freedom (ddof) of 1.", "Matplotlib subplot (matplotlib.axes._subplots.AxesSubplot): An object representing the", "generated histogram plot with the normal distribution curve."], "reqs": ["csv", "numpy", "scipy", "matplotlib"], "raises": [], "example": [">>> mean, std_dev, ax = f_898('population_data.csv')", ">>> print(mean, std_dev)", "(50.5, 29.011491975882016)"]}} -{"task_id": "f_538", "prompt": "import numpy as np\nimport pandas as pd\nfrom scipy.stats import linregress\n\n\ndef f_538(df):\n \"\"\"\n Analyze the relationship between two variables in a DataFrame.\n The function performs a linear regression on the two variables and adds a 'predicted' column to the DataFrame.\n\n Parameters:\n - df (pandas.DataFrame): The input DataFrame with columns 'var1', 'var2'.\n \n Returns:\n - df (pandas.DataFrame): The DataFrame with the added 'predicted' column.\n\n Requirements:\n - numpy\n - pandas\n - scipy\n\n Example:\n >>> df = pd.DataFrame({'var1': np.random.randn(10),\n ... 'var2': np.random.randn(10)})\n >>> df = f_538(df)\n >>> assert 'predicted' in df.columns\n >>> assert len(df) == 10\n >>> assert len(df.columns) == 3\n \"\"\"", "canonical_solution": " \n regression = linregress(df['var1'], df['var2'])\n \n # Explicit use of np.array to demonstrate the np. 
prefix usage\n # This step is purely illustrative and may not be necessary for this specific logic\n predictions = np.array(regression.slope) * np.array(df['var1']) + np.array(regression.intercept)\n \n df['predicted'] = pd.Series(predictions, index=df.index)\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'var1': np.random.randn(10),\n 'var2': np.random.randn(10)})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 10)\n self.assertEqual(len(df.columns), 3)\n def test_case_2(self):\n df = pd.DataFrame({'var1': [1, 2, 3, 4, 5],\n 'var2': [1, 2, 3, 4, 5]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 5)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))\n \n def test_case_3(self):\n df = pd.DataFrame({'var1': [1, 2, 3, 4, 5],\n 'var2': [5, 4, 3, 2, 1]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 5)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))\n def test_case_4(self):\n df = pd.DataFrame({'var1': [1, 2, 3, 4, 5],\n 'var2': [1, 1, 1, 1, 1]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 5)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))\n def test_case_5(self):\n df = pd.DataFrame({'var1': [0, 1, 2, 3, 4, 5],\n 'var2': [1, 1, 1, 1, 1, 1]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 6)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))", "apis": ["scipy.stats.linregress", "pandas.Series", "numpy.array"], "libs": ["numpy", "pandas", "scipy"], "doc": {"description": ["Analyze the relationship between two variables in a DataFrame.", "The function performs a linear regression on the two 
variables and adds a 'predicted' column to the DataFrame."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame with columns 'var1', 'var2'."], "returns": ["df (pandas.DataFrame): The DataFrame with the added 'predicted' column."], "reqs": ["numpy", "pandas", "scipy"], "raises": [], "example": [">>> df = pd.DataFrame({'var1': np.random.randn(10),", "... 'var2': np.random.randn(10)})", ">>> df = f_538(df)", ">>> assert 'predicted' in df.columns", ">>> assert len(df) == 10", ">>> assert len(df.columns) == 3"]}} -{"task_id": "f_862", "prompt": "from PIL import Image\nimport codecs\nimport pytesseract\n\n\nIMAGE_PATH = \"image.png\"\n\n\ndef f_862(filename=IMAGE_PATH, from_encoding=\"cp1251\", to_encoding=\"utf8\"):\n \"\"\"\n Opens an image file, extracts text using OCR, and converts the text encoding, with a fallback to image comment processing.\n\n Raises:\n - ValueError: UnicodeDecodeError or LookupError occurs during conversion\n\n Parameters:\n - filename (str): The path to the image file. Defaults to a global variable 'IMAGE_PATH'.\n - from_encoding (str): The original encoding of the extracted text or image comment. Default is 'cp1251'.\n - to_encoding (str): The target encoding for the converted text or comment. 
Default is 'utf8'.\n\n Returns:\n - comment (str): The text extracted from the image or the image comment, converted to the target encoding.\n If OCR extraction and comment processing both fail, returns an empty string.\n\n Raises:\n - ValueError: If incorrect encodings are provided for the text or comment conversion.\n\n Requirements:\n - codecs\n - PIL\n - pytesseract\n\n Example:\n # Assuming 'image.png' contains the text '\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' in Russian (encoded in cp1251),\n # and this text is successfully extracted by the OCR.\n >>> text = f_862('image.png', 'cp1251', 'utf8')\n >>> print(text)\n '\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' # This output is the utf-8 encoded version of the extracted text.\n \"\"\"", "canonical_solution": " with Image.open(filename) as image:\n try:\n extracted_text = pytesseract.image_to_string(image)\n if extracted_text:\n try:\n return extracted_text.encode(from_encoding).decode(to_encoding)\n except (UnicodeDecodeError, LookupError) as exc:\n raise ValueError(\"Incorrect encoding provided.\") from exc\n except Exception:\n # If OCR fails, fall back to processing the image comment\n pass\n\n comment = image.info.get(\"comment\", \"\")\n if isinstance(comment, bytes):\n try:\n return (\n codecs.decode(comment, from_encoding)\n .encode(to_encoding)\n .decode(to_encoding)\n )\n except (UnicodeDecodeError, LookupError) as exc:\n raise ValueError(\"Incorrect encoding provided.\") from exc\n\n return comment", "test": "import unittest\nfrom unittest.mock import patch, Mock\nfrom PIL import Image\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_862 function.\"\"\"\n def setUp(self):\n self.mock_image = Mock()\n self.mock_image.info.get.return_value = b\"Mocked Comment in cp1251\"\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\")\n def test_successful_ocr_extraction_and_encoding(self, mock_ocr, mock_open):\n \"\"\"Test with successful OCR text 
extraction and encoding conversion.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n mock_ocr.return_value = \"Extracted Text in cp1251\"\n result = f_862(\"dummy_path\", \"cp1251\", \"utf8\")\n self.assertEqual(result, \"Extracted Text in cp1251\")\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\", side_effect=Exception)\n def test_ocr_fails_comment_extraction_succeeds(self, mock_ocr, mock_open):\n \"\"\"Test OCR fails, but comment extraction and encoding conversion succeed.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n # Mocked comment in cp1251 encoding\n self.mock_image.info.get.return_value = \"Mocked Comment in cp1251\".encode(\n \"cp1251\"\n )\n result = f_862(\"dummy_path\", \"cp1251\", \"utf8\")\n # Expected result after converting the mocked comment from cp1251 to utf8\n expected_result = \"Mocked Comment in cp1251\".encode(\"cp1251\").decode(\"utf8\")\n self.assertEqual(result, expected_result)\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\")\n def test_ocr_succeeds_encoding_fails(self, mock_ocr, mock_open):\n \"\"\"Test OCR text extraction succeeds, but encoding conversion fails.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n mock_ocr.return_value = \"Extracted Text in wrong encoding\"\n with self.assertRaises(ValueError):\n f_862(\"dummy_path\", \"invalid_encoding\", \"utf8\")\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\", side_effect=Exception)\n def test_ocr_and_comment_extraction_fail(self, mock_ocr, mock_open):\n \"\"\"Test both OCR and comment extraction fail.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n self.mock_image.info.get.return_value = \"\" # No comment in metadata\n result = f_862(\"dummy_path\")\n self.assertEqual(result, \"\")\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\")\n def test_ocr_extraction_succeeds_no_encoding_needed(self, mock_ocr, 
mock_open):\n \"\"\"Test OCR extraction succeeds, no encoding conversion needed.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n mock_ocr.return_value = \"Extracted Text already in utf8\"\n result = f_862(\"dummy_path\", \"utf8\", \"utf8\")\n self.assertEqual(result, \"Extracted Text already in utf8\")", "apis": ["pytesseract.image_to_string", "codecs.decode", "PIL.Image.open"], "libs": ["pytesseract", "PIL", "codecs"], "doc": {"description": ["Opens an image file, extracts text using OCR, and converts the text encoding, with a fallback to image comment processing."], "note": [], "params": ["filename (str): The path to the image file. Defaults to a global variable 'IMAGE_PATH'.", "from_encoding (str): The original encoding of the extracted text or image comment. Default is 'cp1251'.", "to_encoding (str): The target encoding for the converted text or comment. Default is 'utf8'."], "returns": ["comment (str): The text extracted from the image or the image comment, converted to the target encoding.", "If OCR extraction and comment processing both fail, returns an empty string."], "reqs": ["codecs", "PIL", "pytesseract"], "raises": ["ValueError: UnicodeDecodeError or LookupError occurs during conversion", "ValueError: If incorrect encodings are provided for the text or comment conversion."], "example": ["# Assuming 'image.png' contains the text '\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' in Russian (encoded in cp1251),", "# and this text is successfully extracted by the OCR.", ">>> text = f_862('image.png', 'cp1251', 'utf8')", ">>> print(text)", "'\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' # This output is the utf-8 encoded version of the extracted text."]}} +{"task_id": "f_611", "prompt": "import json\nimport csv\n\ndef f_611(json_file, csv_file):\n \"\"\"\n Convert a JSON file to CSV.\n \n Parameters:\n - json_file (str): The path to the JSON file.\n - csv_file (str): The path to the CSV file.\n\n Returns:\n - 
csv_file: The function returns the path to the CSV file that was written.\n\n Requirements:\n - json\n - csv\n \n Example:\n >>> f_611('path_to_json_file.json', 'path_to_csv_file.csv')\n 'path_to_csv_file.csv'\n \"\"\"", "canonical_solution": " with open(json_file, 'r') as f:\n data = json.load(f)\n\n with open(csv_file, 'w') as f:\n writer = csv.writer(f)\n writer.writerow(data.keys())\n writer.writerow(data.values())\n \n return csv_file", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Create json file\n json_file = './test.json'\n with open(json_file, 'w') as f:\n json.dump({'a': 1, 'b': 2, 'c': 3}, f)\n # Run function\n csv_file = f_611(json_file, './test.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['a', 'b', 'c'], ['1', '2', '3']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_2(self):\n # Create json file\n json_file = './test.json'\n with open(json_file, 'w') as f:\n json.dump({'z': 1, 'y': 2, 'x': 3}, f)\n # Run function\n csv_file = f_611(json_file, './test.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['z', 'y', 'x'], ['1', '2', '3']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_3(self):\n # Create json file\n json_file = './testx.json'\n with open(json_file, 'w') as f:\n json.dump({'xxx': 99}, f)\n # Run function\n csv_file = f_611(json_file, './testx.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['xxx'], ['99']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_4(self):\n # Create json file\n json_file = 
'./testy.json'\n with open(json_file, 'w') as f:\n json.dump({'yyy': 99}, f)\n # Run function\n csv_file = f_611(json_file, './testy.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['yyy'], ['99']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)\n def test_case_5(self):\n # Create json file\n json_file = './testz.json'\n with open(json_file, 'w') as f:\n json.dump({'zzz': 99}, f)\n # Run function\n csv_file = f_611(json_file, './testz.csv')\n # Check file\n self.assertTrue(os.path.exists(csv_file))\n with open(csv_file, 'r') as f:\n reader = csv.reader(f)\n csv_data = list(reader)\n self.assertEqual(csv_data, [['zzz'], ['99']])\n # Remove file\n os.remove(json_file)\n os.remove(csv_file)", "apis": ["json.load", "csv.writer"], "libs": ["json", "csv"], "doc": {"description": ["Convert a JSON file to CSV."], "note": [], "params": ["json_file (str): The path to the JSON file.", "csv_file (str): The path to the CSV file."], "returns": ["csv_file: The function returns the path to the CSV file that was written."], "reqs": ["json", "csv"], "raises": [], "example": [">>> f_611('path_to_json_file.json', 'path_to_csv_file.csv')", "'path_to_csv_file.csv'"]}} +{"task_id": "f_342", "prompt": "import pickle\nimport os\nimport pandas as pd\nimport numpy as np\n\n\ndef f_342(df, file_name=\"save.pkl\"):\n \"\"\"\n Save the provided Pandas DataFrame \"df\" in a pickle file with the given name, read it\n back for validation, and delete the intermediate file.\n\n Parameters:\n df (DataFrame): The pandas DataFrame to be saved.\n file_name (str, optional): Name of the file where the DataFrame will be saved. 
Defaults to 'save.pkl'.\n\n Returns:\n loaded_df (pd.DataFrame): The loaded DataFrame from the specified file.\n\n Requirements:\n - pickle\n - os\n\n Example:\n >>> np.random.seed(0)\n >>> df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))\n >>> loaded_df = f_342(df, 'test_file.pkl')\n >>> assert df.equals(loaded_df)\n >>> type(df), type(loaded_df)\n (, )\n >>> df.head(2)\n A B C D\n 0 44 47 64 67\n 1 67 9 83 21\n \"\"\"", "canonical_solution": " with open(file_name, \"wb\") as file:\n pickle.dump(df, file)\n\n with open(file_name, \"rb\") as file:\n loaded_df = pickle.load(file)\n\n os.remove(file_name)\n\n return loaded_df", "test": "import unittest\nimport os\nimport pandas as pd\nimport numpy as np\nimport tempfile\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n def tearDown(self):\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test with random integers\n df = pd.DataFrame(\n np.random.randint(0, 100, size=(100, 4)), columns=list(\"ABCD\")\n )\n file_path = os.path.join(self.temp_dir.name, \"test.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_2(self):\n # Test with floats\n df = pd.DataFrame(np.random.rand(50, 3), columns=list(\"XYZ\"))\n file_path = os.path.join(self.temp_dir.name, \"floats.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_3(self):\n # Test with strings\n df = pd.DataFrame({\"A\": [\"foo\", \"bar\", \"baz\"], \"B\": [\"qux\", \"quux\", \"corge\"]})\n file_path = os.path.join(self.temp_dir.name, \"strings.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_4(self):\n # Test with empty dataframe\n df = pd.DataFrame()\n file_path = 
os.path.join(self.temp_dir.name, \"empty.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_5(self):\n # Test with datetime\n df = pd.DataFrame(\n {\"Date\": [datetime(2020, 1, 1), datetime(2020, 1, 2)], \"Value\": [10, 20]}\n )\n file_path = os.path.join(self.temp_dir.name, \"datetime.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_6(self):\n # Test larger dataframe\n df = pd.DataFrame(\n np.random.randint(0, 100, size=(10000, 10)),\n columns=[f\"Col{i}\" for i in range(10)],\n )\n file_path = os.path.join(self.temp_dir.name, \"large.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(df.equals(loaded_df))\n self.assertFalse(os.path.exists(file_path))\n def test_case_7(self):\n # Test single entry dataframe\n df = pd.DataFrame({\"Single\": [42]})\n file_path = os.path.join(self.temp_dir.name, \"test_file_small.pkl\")\n loaded_df = f_342(df, file_path)\n self.assertTrue(\n df.equals(loaded_df), \"Loaded DataFrame does not match the original.\"\n )\n self.assertFalse(os.path.exists(file_path))", "apis": ["os.remove", "pickle.load", "pickle.dump"], "libs": ["os", "pickle"], "doc": {"description": ["Save the provided Pandas DataFrame \"df\" in a pickle file with the given name, read it", "back for validation, and delete the intermediate file."], "note": [], "params": ["df (DataFrame): The pandas DataFrame to be saved.", "file_name (str, optional): Name of the file where the DataFrame will be saved. 
Defaults to 'save.pkl'."], "returns": ["loaded_df (pd.DataFrame): The loaded DataFrame from the specified file."], "reqs": ["pickle", "os"], "raises": [], "example": [">>> np.random.seed(0)", ">>> df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))", ">>> loaded_df = f_342(df, 'test_file.pkl')", ">>> assert df.equals(loaded_df)", ">>> type(df), type(loaded_df)", "(, )", ">>> df.head(2)", "A B C D", "0 44 47 64 67", "1 67 9 83 21"]}} +{"task_id": "f_898", "prompt": "import csv\nimport numpy as np\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\n\n\ndef f_898(file_path):\n \"\"\"\n This function processes a CSV file containing numeric data representing a population. It randomly\n selects 30 individuals from this population without replacement to form a sample. The function\n calculates the mean and standard deviation of this sample. The means delta degree is 1. It also generates a histogram of the\n sample data and overlays a normal distribution curve on this histogram.\n\n Parameters:\n - file_path (str): A string representing the path to the CSV file. 
Each line in the file should contain\n a single numeric value representing an individual in the population.\n\n Returns:\n - Tuple (float, float, matplotlib.axes._subplots.Axes): The function returns a tuple containing\n three elements:\n - Sample mean (float): The mean of the sample.\n - Sample standard deviation (float): The standard deviation of the sample, calculated with a\n degrees of freedom (ddof) of 1.\n - Matplotlib subplot (matplotlib.axes._subplots.Axes): An object representing the\n generated histogram plot with the normal distribution curve.\n\n Requirements:\n - csv\n - numpy\n - scipy\n - matplotlib\n\n Notes:\n - The function uses numpy for random sampling and statistical calculations.\n - The matplotlib library is used to plot the histogram and the normal distribution curve.\n - The function includes exception handling for file input/output errors, ensuring that any issues\n with reading the CSV file are properly communicated.\n - The function plots a histogram of the sample using matplotlib, with the number of bins\n determined automatically ('auto').\n\n Example:\n >>> mean, std_dev, ax = f_898('population_data.csv')\n >>> print(mean, std_dev)\n (50.5, 29.011491975882016)\n\n In this example, 'population_data.csv' is a CSV file where each line contains a numeric value. The\n function reads this file, samples 30 values, computes their mean and standard deviation, and plots\n a histogram with a normal distribution curve.\n \"\"\"", "canonical_solution": " try:\n with open(file_path, \"r\", encoding=\"utf-8\") as file:\n reader = csv.reader(file)\n population = [int(row[0]) for row in reader]\n except IOError as exc:\n raise IOError(\n \"Error reading the file. 
Please check the file path and permissions.\"\n ) from exc\n\n sample = np.random.choice(population, 30, replace=False)\n mean = np.mean(sample)\n std_dev = np.std(sample, ddof=1)\n\n plt.hist(sample, bins=\"auto\", density=True, alpha=0.7, rwidth=0.85)\n xmin, xmax = plt.xlim()\n x = np.linspace(xmin, xmax, 100)\n p = stats.norm.pdf(x, mean, std_dev)\n plt.plot(x, p, \"k\", linewidth=2)\n plt.xlabel(\"Sample Values\")\n plt.ylabel(\"Frequency\")\n plt.title(\"Sample Histogram with Normal Distribution Overlay\")\n ax = plt.gca()\n\n return mean, std_dev, ax", "test": "import unittest\nfrom unittest.mock import patch, mock_open\nimport matplotlib\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_898.\"\"\"\n def setUp(self):\n \"\"\"Set up the test environment.\"\"\"\n matplotlib.use(\"Agg\")\n def test_valid_csv_file(self):\n \"\"\"Test with a valid CSV file.\"\"\"\n mock_data = \"1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n10\\n11\\n12\\n13\\n14\\n15\\n16\\n17\\n18\\n19\\n20\\n21\\n22\\n23\\n24\\n25\\n26\\n27\\n28\\n29\\n30\\n31\"\n with patch(\"builtins.open\", mock_open(read_data=mock_data)):\n mean, std_dev, ax = f_898(\"dummy_path\")\n self.assertIsNotNone(mean)\n self.assertIsNotNone(std_dev)\n def test_empty_csv_file(self):\n \"\"\"Test with an empty CSV file.\"\"\"\n mock_data = \"\"\n with patch(\"builtins.open\", mock_open(read_data=mock_data)), self.assertRaises(\n ValueError\n ):\n f_898(\"dummy_path\")\n def test_non_existent_file(self):\n \"\"\"Test with a non-existent file path.\"\"\"\n with self.assertRaises(IOError):\n f_898(\"non_existent_path.csv\")\n def test_csv_with_non_numeric_data(self):\n \"\"\"Test with a CSV file containing non-numeric data.\"\"\"\n mock_data = \"a\\nb\\nc\\nd\\ne\"\n with patch(\"builtins.open\", mock_open(read_data=mock_data)), self.assertRaises(\n ValueError\n ):\n f_898(\"dummy_path\")\n def test_small_population_size(self):\n \"\"\"Test with a small population size.\"\"\"\n mock_data = \"1\\n2\\n3\\n4\\n5\"\n 
with patch(\"builtins.open\", mock_open(read_data=mock_data)), self.assertRaises(\n ValueError\n ):\n f_898(\"dummy_path\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random", "scipy.stats.norm", "matplotlib.pyplot.hist", "numpy.mean", "matplotlib.pyplot.xlim", "scipy.stats.norm.pdf", "csv.reader", "matplotlib.pyplot.ylabel", "numpy.linspace", "matplotlib.pyplot.title", "matplotlib.pyplot.plot", "numpy.random.choice", "numpy.std", "matplotlib.pyplot.gca", "matplotlib.pyplot.xlabel"], "libs": ["csv", "numpy", "matplotlib", "scipy"], "doc": {"description": ["This function processes a CSV file containing numeric data representing a population. It randomly", "selects 30 individuals from this population without replacement to form a sample. The function", "calculates the mean and standard deviation of this sample. The means delta degree is 1. It also generates a histogram of the", "sample data and overlays a normal distribution curve on this histogram.", "Notes:", "- The function uses numpy for random sampling and statistical calculations.", "- The matplotlib library is used to plot the histogram and the normal distribution curve.", "- The function includes exception handling for file input/output errors, ensuring that any issues", "with reading the CSV file are properly communicated.", "- The function plots a histogram of the sample using matplotlib, with the number of bins", "determined automatically ('auto').", "In this example, 'population_data.csv' is a CSV file where each line contains a numeric value. The", "function reads this file, samples 30 values, computes their mean and standard deviation, and plots", "a histogram with a normal distribution curve."], "note": [], "params": ["file_path (str): A string representing the path to the CSV file. 
Each line in the file should contain", "a single numeric value representing an individual in the population."], "returns": ["Tuple (float, float, matplotlib.axes._subplots.Axes): The function returns a tuple containing", "three elements:", "Sample mean (float): The mean of the sample.", "Sample standard deviation (float): The standard deviation of the sample, calculated with a", "degrees of freedom (ddof) of 1.", "Matplotlib subplot (matplotlib.axes._subplots.Axes): An object representing the", "generated histogram plot with the normal distribution curve."], "reqs": ["csv", "numpy", "scipy", "matplotlib"], "raises": [], "example": [">>> mean, std_dev, ax = f_898('population_data.csv')", ">>> print(mean, std_dev)", "(50.5, 29.011491975882016)"]}} +{"task_id": "f_538", "prompt": "import numpy as np\nimport pandas as pd\nfrom scipy.stats import linregress\n\n\ndef f_538(df):\n \"\"\"\n Analyze the relationship between two variables in a DataFrame.\n The function performs a linear regression on the two variables and adds a 'predicted' column to the DataFrame.\n\n Parameters:\n - df (pandas.DataFrame): The input DataFrame with columns 'var1', 'var2'.\n \n Returns:\n - df (pandas.DataFrame): The DataFrame with the added 'predicted' column.\n\n Requirements:\n - numpy\n - pandas\n - scipy\n\n Example:\n >>> df = pd.DataFrame({'var1': np.random.randn(10),\n ... 'var2': np.random.randn(10)})\n >>> df = f_538(df)\n >>> assert 'predicted' in df.columns\n >>> assert len(df) == 10\n >>> assert len(df.columns) == 3\n \"\"\"", "canonical_solution": " \n regression = linregress(df['var1'], df['var2'])\n \n # Explicit use of np.array to demonstrate the np. 
prefix usage\n # This step is purely illustrative and may not be necessary for this specific logic\n predictions = np.array(regression.slope) * np.array(df['var1']) + np.array(regression.intercept)\n \n df['predicted'] = pd.Series(predictions, index=df.index)\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'var1': np.random.randn(10),\n 'var2': np.random.randn(10)})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 10)\n self.assertEqual(len(df.columns), 3)\n def test_case_2(self):\n df = pd.DataFrame({'var1': [1, 2, 3, 4, 5],\n 'var2': [1, 2, 3, 4, 5]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 5)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))\n \n def test_case_3(self):\n df = pd.DataFrame({'var1': [1, 2, 3, 4, 5],\n 'var2': [5, 4, 3, 2, 1]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 5)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))\n def test_case_4(self):\n df = pd.DataFrame({'var1': [1, 2, 3, 4, 5],\n 'var2': [1, 1, 1, 1, 1]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 5)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))\n def test_case_5(self):\n df = pd.DataFrame({'var1': [0, 1, 2, 3, 4, 5],\n 'var2': [1, 1, 1, 1, 1, 1]})\n df = f_538(df)\n self.assertTrue('predicted' in df.columns)\n self.assertEqual(len(df), 6)\n self.assertEqual(len(df.columns), 3)\n self.assertTrue(np.all(df['predicted'] == df['var2']))", "apis": ["scipy.stats.linregress", "numpy.array", "pandas.Series"], "libs": ["pandas", "numpy", "scipy"], "doc": {"description": ["Analyze the relationship between two variables in a DataFrame.", "The function performs a linear regression on the two 
variables and adds a 'predicted' column to the DataFrame."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame with columns 'var1', 'var2'."], "returns": ["df (pandas.DataFrame): The DataFrame with the added 'predicted' column."], "reqs": ["numpy", "pandas", "scipy"], "raises": [], "example": [">>> df = pd.DataFrame({'var1': np.random.randn(10),", "... 'var2': np.random.randn(10)})", ">>> df = f_538(df)", ">>> assert 'predicted' in df.columns", ">>> assert len(df) == 10", ">>> assert len(df.columns) == 3"]}} +{"task_id": "f_862", "prompt": "from PIL import Image\nimport codecs\nimport pytesseract\n\n\nIMAGE_PATH = \"image.png\"\n\n\ndef f_862(filename=IMAGE_PATH, from_encoding=\"cp1251\", to_encoding=\"utf8\"):\n \"\"\"\n Opens an image file, extracts text using OCR, and converts the text encoding, with a fallback to image comment processing.\n\n Raises:\n - ValueError: UnicodeDecodeError or LookupError occurs during conversion\n\n Parameters:\n - filename (str): The path to the image file. Defaults to a global variable 'IMAGE_PATH'.\n - from_encoding (str): The original encoding of the extracted text or image comment. Default is 'cp1251'.\n - to_encoding (str): The target encoding for the converted text or comment. 
Default is 'utf8'.\n\n Returns:\n - comment (str): The text extracted from the image or the image comment, converted to the target encoding.\n If OCR extraction and comment processing both fail, returns an empty string.\n\n Raises:\n - ValueError: If incorrect encodings are provided for the text or comment conversion.\n\n Requirements:\n - codecs\n - PIL\n - pytesseract\n\n Example:\n # Assuming 'image.png' contains the text '\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' in Russian (encoded in cp1251),\n # and this text is successfully extracted by the OCR.\n >>> text = f_862('image.png', 'cp1251', 'utf8')\n >>> print(text)\n '\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' # This output is the utf-8 encoded version of the extracted text.\n \"\"\"", "canonical_solution": " with Image.open(filename) as image:\n try:\n extracted_text = pytesseract.image_to_string(image)\n if extracted_text:\n try:\n return extracted_text.encode(from_encoding).decode(to_encoding)\n except (UnicodeDecodeError, LookupError) as exc:\n raise ValueError(\"Incorrect encoding provided.\") from exc\n except Exception:\n # If OCR fails, fall back to processing the image comment\n pass\n\n comment = image.info.get(\"comment\", \"\")\n if isinstance(comment, bytes):\n try:\n return (\n codecs.decode(comment, from_encoding)\n .encode(to_encoding)\n .decode(to_encoding)\n )\n except (UnicodeDecodeError, LookupError) as exc:\n raise ValueError(\"Incorrect encoding provided.\") from exc\n\n return comment", "test": "import unittest\nfrom unittest.mock import patch, Mock\nfrom PIL import Image\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_862 function.\"\"\"\n def setUp(self):\n self.mock_image = Mock()\n self.mock_image.info.get.return_value = b\"Mocked Comment in cp1251\"\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\")\n def test_successful_ocr_extraction_and_encoding(self, mock_ocr, mock_open):\n \"\"\"Test with successful OCR text 
extraction and encoding conversion.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n mock_ocr.return_value = \"Extracted Text in cp1251\"\n result = f_862(\"dummy_path\", \"cp1251\", \"utf8\")\n self.assertEqual(result, \"Extracted Text in cp1251\")\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\", side_effect=Exception)\n def test_ocr_fails_comment_extraction_succeeds(self, mock_ocr, mock_open):\n \"\"\"Test OCR fails, but comment extraction and encoding conversion succeed.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n # Mocked comment in cp1251 encoding\n self.mock_image.info.get.return_value = \"Mocked Comment in cp1251\".encode(\n \"cp1251\"\n )\n result = f_862(\"dummy_path\", \"cp1251\", \"utf8\")\n # Expected result after converting the mocked comment from cp1251 to utf8\n expected_result = \"Mocked Comment in cp1251\".encode(\"cp1251\").decode(\"utf8\")\n self.assertEqual(result, expected_result)\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\")\n def test_ocr_succeeds_encoding_fails(self, mock_ocr, mock_open):\n \"\"\"Test OCR text extraction succeeds, but encoding conversion fails.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n mock_ocr.return_value = \"Extracted Text in wrong encoding\"\n with self.assertRaises(ValueError):\n f_862(\"dummy_path\", \"invalid_encoding\", \"utf8\")\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\", side_effect=Exception)\n def test_ocr_and_comment_extraction_fail(self, mock_ocr, mock_open):\n \"\"\"Test both OCR and comment extraction fail.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n self.mock_image.info.get.return_value = \"\" # No comment in metadata\n result = f_862(\"dummy_path\")\n self.assertEqual(result, \"\")\n @patch(\"PIL.Image.open\")\n @patch(\"pytesseract.image_to_string\")\n def test_ocr_extraction_succeeds_no_encoding_needed(self, mock_ocr, 
mock_open):\n \"\"\"Test OCR extraction succeeds, no encoding conversion needed.\"\"\"\n mock_open.return_value.__enter__.return_value = self.mock_image\n mock_ocr.return_value = \"Extracted Text already in utf8\"\n result = f_862(\"dummy_path\", \"utf8\", \"utf8\")\n self.assertEqual(result, \"Extracted Text already in utf8\")", "apis": ["codecs.decode", "PIL.Image.open", "pytesseract.image_to_string"], "libs": ["PIL", "codecs", "pytesseract"], "doc": {"description": ["Opens an image file, extracts text using OCR, and converts the text encoding, with a fallback to image comment processing."], "note": [], "params": ["filename (str): The path to the image file. Defaults to a global variable 'IMAGE_PATH'.", "from_encoding (str): The original encoding of the extracted text or image comment. Default is 'cp1251'.", "to_encoding (str): The target encoding for the converted text or comment. Default is 'utf8'."], "returns": ["comment (str): The text extracted from the image or the image comment, converted to the target encoding.", "If OCR extraction and comment processing both fail, returns an empty string."], "reqs": ["codecs", "PIL", "pytesseract"], "raises": ["ValueError: UnicodeDecodeError or LookupError occurs during conversion", "ValueError: If incorrect encodings are provided for the text or comment conversion."], "example": ["# Assuming 'image.png' contains the text '\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' in Russian (encoded in cp1251),", "# and this text is successfully extracted by the OCR.", ">>> text = f_862('image.png', 'cp1251', 'utf8')", ">>> print(text)", "'\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440' # This output is the utf-8 encoded version of the extracted text."]}} {"task_id": "f_550", "prompt": "import numpy as np\nfrom scipy.stats import mode\n\ndef f_550(list_of_lists):\n \"\"\"\n Merges a predefined set of lists into a list and finds the mode of the elements in the list.\n\n Parameters:\n - list_of_lists (list): The 
list to be processed.\n\n Returns:\n - tuple: The mode and count of the mode in the merged list.\n - mode_value (np.array): The value that appears most frequently in the merged array.\n - mode_count (int): The frequency count of the mode_value within the merged array.\n\n Requirements:\n - numpy\n - scipy\n \n Example:\n >>> f_550([[1, 1, 3], [4, 5, 6], [7, 8, 9]])\n (array([1]), array([2]))\n \"\"\"", "canonical_solution": " merged_list = np.array([item for sublist in list_of_lists for item in sublist])\n mode_value, mode_count = mode(merged_list)\n return mode_value, mode_count", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_550([[1, 1, 3], [4, 5, 6], [7, 8, 9]]), (1, 2))\n def test_case_2(self):\n self.assertEqual(f_550([[1, 1, 3], [4, 5, 6], [7, 8, 9], [1, 1, 1]]), (1, 5))\n def test_case_3(self):\n self.assertEqual(f_550([[1, 1, 3], [4, 5, 6], [7, 8, 9], [1, 1, 1], [2, 2, 2]]), (1, 5))\n def test_case_4(self):\n self.assertEqual(f_550([[1, 1, 3], [4, 5, 6], [7, 8, 9], [1, 1, 1], [2, 2, 2], [3, 3, 3]]), (1, 5))\n def test_case_5(self):\n self.assertEqual(f_550([[1, 1, 3], [4, 5, 6], [7, 8, 9], [1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]]), (1, 5))", "apis": ["scipy.stats.mode", "numpy.array"], "libs": ["numpy", "scipy"], "doc": {"description": ["Merges a predefined set of lists into a list and finds the mode of the elements in the list."], "note": [], "params": ["list_of_lists (list): The list to be processed."], "returns": ["tuple: The mode and count of the mode in the merged list.", "mode_value (np.array): The value that appears most frequently in the merged array.", "mode_count (int): The frequency count of the mode_value within the merged array."], "reqs": ["numpy", "scipy"], "raises": [], "example": [">>> f_550([[1, 1, 3], [4, 5, 6], [7, 8, 9]])", "(array([1]), array([2]))"]}} -{"task_id": "f_363", "prompt": "import subprocess\nimport psutil\nimport time\nimport os\n\n\ndef f_363(script_path: 
str, timeout=10) -> dict:\n \"\"\"\n Executes a given bash script and returns the CPU and memory usage of the script's process.\n\n This function checks whether the script path exists, then it executes it in a subprocess\n and uses psutil to monitor the script's process for CPU and memory usage.\n Note:\n - CPU usage is a cumulative measure of the script process's CPU demand over the execution\n period, not an average across cores.\n - Memory usage is reported as the sum of RSS memory increments.\n The function aggregates these metrics until the script completes or the specified timeout is\n reached. It handles cases where the process becomes a zombie or is not found, and ensures the\n subprocess is terminated if it runs beyond the timeout.\n\n Parameters:\n script_path (str): The path to the bash script to be executed. Path must exist.\n timeout (int, optional): Maximum time (in seconds) the function should wait for the script to complete.\n Defaults to 10 seconds.\n\n Returns:\n dict: A dictionary containing:\n - 'CPU Usage': The accumulated CPU usage in percentage.\n - 'Memory Usage': The accumulated memory usage in bytes.\n\n Requirements:\n - subprocess\n - psutil\n - time\n - os\n \n Examples:\n >>> resources = f_363('/path/to/script.sh')\n >>> resources\n {'CPU Usage': 5.2, 'Memory Usage': 2048}\n \"\"\"", "canonical_solution": " if not os.path.exists(script_path):\n raise FileNotFoundError(f\"'{script_path}' does not exist.\")\n\n # Start the bash script process\n p = subprocess.Popen([\"bash\", script_path])\n pid = p.pid\n\n # Initialize resources\n total_cpu = 0.0\n total_memory = 0\n\n start_time = time.time()\n\n try:\n # Fetch the process using psutil\n process = psutil.Process(pid)\n\n # Continuously fetch the process statistics\n while process.is_running():\n # Get the CPU and memory usage\n cpu_percent = process.cpu_percent(interval=0.05)\n total_cpu += cpu_percent\n total_memory += process.memory_info().rss\n time.sleep(0.05)\n\n # Check for 
timeout\n if time.time() - start_time > timeout:\n break\n except (psutil.NoSuchProcess, psutil.ZombieProcess):\n pass\n finally:\n if p.poll() is None:\n p.terminate()\n p.wait()\n\n return {\"CPU Usage\": total_cpu, \"Memory Usage\": total_memory}", "test": "import unittest\nimport os\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.temp_path = self.temp_dir.name\n # Create scripts for testing\n self.script_path_1 = os.path.join(self.temp_path, \"script.sh\")\n with open(self.script_path_1, \"w\") as script_file:\n os.chmod(self.script_path_1, 0o755)\n script_file.write(\"#!/bin/bash\\nsleep 5\")\n self.script_path_2 = os.path.join(self.temp_path, \"cpu_script.sh\")\n with open(self.script_path_2, \"w\") as script_file:\n os.chmod(self.script_path_2, 0o755)\n script_file.write(\n \"#!/bin/bash\\nfor i in {1..10000}\\ndo\\n echo $i > /dev/null\\ndone\"\n )\n def tearDown(self):\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test returned data structure\n resources = f_363(self.script_path_1)\n self.assertIn(\"CPU Usage\", resources)\n self.assertIn(\"Memory Usage\", resources)\n def test_case_2(self):\n # Test returned data type\n resources = f_363(self.script_path_1)\n self.assertIsInstance(resources[\"CPU Usage\"], float)\n self.assertIsInstance(resources[\"Memory Usage\"], int)\n def test_case_3(self):\n # Testing with a non-existent script\n with self.assertRaises(FileNotFoundError):\n f_363(\"non_existent_script.sh\")\n def test_case_4(self):\n # Check if CPU Usage is accumulated correctly\n resources = f_363(self.script_path_2)\n self.assertGreater(resources[\"CPU Usage\"], 0)\n def test_case_5(self):\n # Check if Memory Usage is accumulated correctly\n resources = f_363(self.script_path_2)\n self.assertGreaterEqual(resources[\"Memory Usage\"], 0)\n def test_case_6(self):\n # Test with a script and a high timeout value\n resources = f_363(self.script_path_1, timeout=100)\n 
self.assertTrue(isinstance(resources, dict))\n def test_case_7(self):\n # Test function behavior with zero timeout\n resources = f_363(self.script_path_1, timeout=0)\n self.assertTrue(isinstance(resources, dict))\n def test_case_8(self):\n # Test with a script that requires input\n script_path = os.path.join(self.temp_path, \"input_script.sh\")\n with open(script_path, \"w\") as script_file:\n os.chmod(script_path, 0o755)\n script_file.write(\"#!/bin/bash\\nread varName\")\n resources = f_363(script_path, timeout=5)\n self.assertTrue(isinstance(resources, dict))\n def test_case_9(self):\n # Test with an invalid script path\n with self.assertRaises(FileNotFoundError):\n f_363(os.path.join(self.temp_path, \"/invalid/path/\\0/script.sh\"))\n def test_case_10(self):\n # Test with a script that terminates early\n script_path = os.path.join(self.temp_path, \"terminate_script.sh\")\n with open(script_path, \"w\") as script_file:\n os.chmod(script_path, 0o755)\n script_file.write(\"#!/bin/bash\\nexit 1\")\n resources = f_363(script_path)\n self.assertTrue(isinstance(resources, dict))", "apis": ["os.path.exists", "subprocess.Popen", "psutil.Process", "psutil.ZombieProcess", "os.path", "time.time", "time.sleep", "psutil.NoSuchProcess"], "libs": ["subprocess", "psutil", "time", "os"], "doc": {"description": ["Executes a given bash script and returns the CPU and memory usage of the script's process.", "This function checks whether the script path exists, then it executes it in a subprocess", "and uses psutil to monitor the script's process for CPU and memory usage."], "note": ["CPU usage is a cumulative measure of the script process's CPU demand over the execution", "period, not an average across cores.", "Memory usage is reported as the sum of RSS memory increments.", "The function aggregates these metrics until the script completes or the specified timeout is", "reached. 
It handles cases where the process becomes a zombie or is not found, and ensures the", "subprocess is terminated if it runs beyond the timeout."], "params": ["script_path (str): The path to the bash script to be executed. Path must exist.", "timeout (int, optional): Maximum time (in seconds) the function should wait for the script to complete.", "Defaults to 10 seconds."], "returns": ["dict: A dictionary containing:", "'CPU Usage': The accumulated CPU usage in percentage.", "'Memory Usage': The accumulated memory usage in bytes."], "reqs": ["subprocess", "psutil", "time", "os"], "raises": [], "example": ["Examples:", ">>> resources = f_363('/path/to/script.sh')", ">>> resources", "{'CPU Usage': 5.2, 'Memory Usage': 2048}"]}} -{"task_id": "f_856", "prompt": "import requests\nfrom urllib.parse import urljoin\nfrom bs4 import BeautifulSoup\nimport csv\n\n\ndef f_856(\n url: str,\n base_url: str = \"https://www.example.com\",\n csv_file: str = \"scraped_data.csv\",\n) -> int:\n \"\"\"\n This function scrapes a webpage for all hyperlinks and saves them as absolute URLs to a CSV file.\n\n Parameters:\n - url (str): The relative URL of the webpage to scrape.\n - base_url (str, optional): The base URL of the website to prepend to relative links. Defaults to 'https://www.example.com'.\n - csv_file (str, optional): The filename for the CSV file where the links will be saved. 
Defaults to 'scraped_data.csv'.\n\n Returns:\n - int: The number of unique absolute links scraped from the webpage.\n\n Requirements:\n - requests\n - urllib.parse.urljoin\n - bs4.BeautifulSoup\n - csv\n\n Examples:\n >>> f_856('/mywebpage')\n 5\n >>> f_856('/anotherpage', base_url='https://www.different.com', csv_file='other_links.csv')\n 8\n \"\"\"", "canonical_solution": " full_url = urljoin(base_url, url)\n response = requests.get(full_url)\n soup = BeautifulSoup(response.text, \"html.parser\")\n\n # Extract and convert all found links to absolute URLs\n links = {urljoin(base_url, a[\"href\"]) for a in soup.find_all(\"a\", href=True)}\n\n with open(csv_file, \"w\", newline=\"\", encoding=\"utf-8\") as csvfile:\n writer = csv.writer(csvfile)\n for link in links:\n writer.writerow([link])\n\n return len(links)", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport requests\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_856.\"\"\"\n @patch(\"requests.get\")\n def test_empty_page(self, mock_get):\n \"\"\"\n Test the function with an empty webpage (no links).\n \"\"\"\n mock_get.return_value = MagicMock(text=\"\")\n result = f_856(\"/empty\")\n self.assertEqual(result, 0)\n @patch(\"requests.get\")\n def test_single_link(self, mock_get):\n \"\"\"\n Test the function with a webpage containing a single link.\n \"\"\"\n mock_get.return_value = MagicMock(\n text='Link1'\n )\n result = f_856(\"/single-link\")\n self.assertEqual(result, 1)\n @patch(\"requests.get\")\n def test_multiple_links(self, mock_get):\n \"\"\"\n Test the function with a webpage containing multiple distinct links.\n \"\"\"\n mock_get.return_value = MagicMock(\n text='Link1Link2'\n )\n result = f_856(\"/multiple-links\")\n self.assertEqual(result, 2)\n @patch(\"requests.get\")\n def test_duplicate_links(self, mock_get):\n \"\"\"\n Test the function with a webpage containing duplicate links.\n \"\"\"\n mock_get.return_value = MagicMock(\n 
text='LinkLink'\n )\n result = f_856(\"/duplicate-links\")\n self.assertEqual(result, 1)\n @patch(\"requests.get\")\n def test_external_links(self, mock_get):\n \"\"\"\n Test the function with a webpage containing external links.\n \"\"\"\n mock_get.return_value = MagicMock(\n text='External Link'\n )\n result = f_856(\"/external-link\")\n self.assertEqual(result, 1)\n @classmethod\n def tearDownClass(cls):\n \"\"\"Remove the database file with retries.\"\"\"\n if os.path.exists(\"scraped_data.csv\"):\n os.remove(\"scraped_data.csv\")", "apis": ["urllib.parse.urljoin", "requests.get", "csv.writer", "bs4.BeautifulSoup"], "libs": ["urllib", "requests", "csv", "bs4"], "doc": {"description": ["This function scrapes a webpage for all hyperlinks and saves them as absolute URLs to a CSV file."], "note": [], "params": ["url (str): The relative URL of the webpage to scrape.", "base_url (str, optional): The base URL of the website to prepend to relative links. Defaults to 'https://www.example.com'.", "csv_file (str, optional): The filename for the CSV file where the links will be saved. 
Defaults to 'scraped_data.csv'."], "returns": ["int: The number of unique absolute links scraped from the webpage."], "reqs": ["requests", "urllib.parse.urljoin", "bs4.BeautifulSoup", "csv"], "raises": [], "example": ["Examples:", ">>> f_856('/mywebpage')", "5", ">>> f_856('/anotherpage', base_url='https://www.different.com', csv_file='other_links.csv')", "8"]}} -{"task_id": "f_784", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_784(start_date: str, periods: int, freq: str, random_seed: int = 0) -> (pd.DataFrame, plt.Axes):\n \"\"\"\n Generates and plots a sales forecast starting from a given date, for a specified number of periods and frequency.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n \n Parameters:\n - start_date (str): Start date for the forecast in 'YYYY-MM-DD' format.\n - periods (int): Number of periods to forecast.\n - freq (str): Frequency of the forecast (e.g., 'WOM-2FRI' for the second Friday of each month, 'M' for monthly).\n - random_seed (int, optional): Seed for the random number generator to ensure reproducibility.\n\n Returns:\n - A tuple containing:\n 1. A DataFrame with columns ['Date', 'Sales'], where 'Date' is the forecast date and 'Sales' are the forecasted sales.\n 2. 
A matplotlib Axes object for the sales forecast plot.\n\n Examples:\n >>> df, ax = f_784('2021-01-01', 5, 'WOM-2FRI')\n >>> print(df)\n Sales\n Date \n 2021-01-08 272\n 2021-02-12 147\n 2021-03-12 217\n 2021-04-09 292\n 2021-05-14 423\n >>> df, ax = f_784('2022-02-01', 3, 'M', random_seed=42)\n >>> print(df)\n Sales\n Date \n 2022-02-28 202\n 2022-03-31 448\n 2022-04-30 370\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n date_range = pd.date_range(start_date, periods=periods, freq=freq)\n sales_forecast = np.random.randint(100, 500, size=periods)\n forecast_df = pd.DataFrame({'Date': date_range, 'Sales': sales_forecast}).set_index('Date')\n\n fig, ax = plt.subplots()\n forecast_df['Sales'].plot(ax=ax, marker='o')\n ax.set_title('Sales Forecast')\n ax.set_xlabel('Date')\n ax.set_ylabel('Sales')\n ax.grid(True)\n \n return forecast_df, ax", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def setUp(self):\n self.random_seed = 42\n def test_basic_forecast(self):\n df, ax = f_784('2021-01-01', 5, 'WOM-2FRI', self.random_seed)\n self.assertEqual(len(df), 5)\n self.assertTrue(all(df.columns == ['Sales']))\n self.assertEqual(ax.get_title(), 'Sales Forecast')\n def test_monthly_forecast(self):\n df, ax = f_784('2022-01-01', 3, 'M', self.random_seed)\n self.assertEqual(len(df), 3)\n self.assertTrue(all(df.columns == ['Sales']))\n def test_quarterly_forecast(self):\n df, ax = f_784('2020-01-01', 4, 'Q', self.random_seed)\n self.assertEqual(len(df), 4)\n self.assertTrue(all(df.columns == ['Sales']))\n def test_invalid_input(self):\n with self.assertRaises(ValueError):\n f_784('2021-13-01', 5, 'M', self.random_seed)\n def test_negative_periods(self):\n with self.assertRaises(ValueError):\n f_784('2021-01-01', -5, 'M', self.random_seed)", "apis": ["numpy.random.randint", "numpy.random", "matplotlib.pyplot.Axes", "numpy.random.seed", "pandas.date_range", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", 
"pandas"], "doc": {"description": ["Generates and plots a sales forecast starting from a given date, for a specified number of periods and frequency."], "note": [], "params": ["start_date (str): Start date for the forecast in 'YYYY-MM-DD' format.", "periods (int): Number of periods to forecast.", "freq (str): Frequency of the forecast (e.g., 'WOM-2FRI' for the second Friday of each month, 'M' for monthly).", "random_seed (int, optional): Seed for the random number generator to ensure reproducibility."], "returns": ["A tuple containing:", "1. A DataFrame with columns ['Date', 'Sales'], where 'Date' is the forecast date and 'Sales' are the forecasted sales.", "2. A matplotlib Axes object for the sales forecast plot."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": ["Examples:", ">>> df, ax = f_784('2021-01-01', 5, 'WOM-2FRI')", ">>> print(df)", "Sales", "Date", "2021-01-08 272", "2021-02-12 147", "2021-03-12 217", "2021-04-09 292", "2021-05-14 423", ">>> df, ax = f_784('2022-02-01', 3, 'M', random_seed=42)", ">>> print(df)", "Sales", "Date", "2022-02-28 202", "2022-03-31 448", "2022-04-30 370"]}} -{"task_id": "f_331", "prompt": "import pandas as pd\nimport seaborn as sns\n\n\ndef f_331(data, column=\"c\"):\n \"\"\"\n Removes a column from a given data dictionary and creates a heatmap\n of the correlation matrix of the remaining data. Non-numeric columns are\n excluded from the heatmap. If the data is empty or has no numeric columns,\n the function returns None.\n\n Parameters:\n - data: The input data dictionary.\n - column (str): Name of column to remove. 
Defaults to \"c\".\n\n Returns:\n - matplotlib.axes._subplots.AxesSubplot or None: The Axes object of the heatmap\n or None if the heatmap is not generated.\n\n Requirements:\n - pandas\n - seaborn\n\n Example:\n >>> f_331({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n \n >>> f_331(pd.DataFrame({'a': [\"foo\", \"bar\"]}))\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n if column in df.columns:\n df = df.drop(columns=column)\n\n df = df.select_dtypes(include=[\"number\"])\n\n if df.empty:\n return None\n\n return sns.heatmap(df.corr())", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n def _assert_heatmap_matches_corr(self, ax, corr):\n # Helper function to assert that the heatmap matches the correlation matrix\n heatmap_data = ax.collections[0].get_array().data\n np.testing.assert_array_almost_equal(\n heatmap_data, corr.values.flatten(), decimal=2\n )\n def test_case_1(self):\n # Input: DataFrame with column \"c\".\n data = {\n \"a\": list(range(10)),\n \"b\": list(range(10)),\n \"c\": list(range(10)),\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data)\n # Assert that column \"c\" is not in the heatmap\n self.assertNotIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n # Check plotted value correctness\n self._assert_heatmap_matches_corr(ax, df.drop(columns=[\"c\"]).corr())\n def test_case_2(self):\n # Input: DataFrame without column \"c\".\n data = {\"a\": list(range(10)), \"b\": list(range(10))}\n df = pd.DataFrame(data)\n ax = f_331(data)\n # Assert that columns \"a\" and \"b\" are in the heatmap\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n # Check plotted value correctness\n self._assert_heatmap_matches_corr(ax, df.corr())\n def test_case_3(self):\n # Input: DataFrame with column \"c\", but we specify another column to remove\n data = 
{\n \"a\": list(range(10)),\n \"b\": list(range(10)),\n \"c\": list(range(10)),\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data, column=\"b\")\n # Assert that column \"b\" is not in the heatmap\n self.assertNotIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n # Assert that other columns are in the heatmap\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n # Check plotted value correctness\n self._assert_heatmap_matches_corr(ax, df.drop(columns=[\"b\"]).corr())\n def test_case_4(self):\n # Input: DataFrame with non-numeric columns and column \"c\".\n data = {\n \"a\": list(range(4)),\n \"b\": [\"low\", \"medium\", \"high\", \"medium\"],\n \"c\": [\"apple\", \"banana\", \"cherry\", \"dates\"],\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data)\n # Assert that only numeric column \"a\" is in the heatmap\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertNotIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertNotIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n def test_case_5(self):\n # Input: DataFrame with missing values and column \"c\".\n np.random.seed(0)\n data = {\n \"a\": np.random.choice([1, np.nan], 100),\n \"b\": np.random.choice([2, np.nan], 100),\n \"c\": np.random.choice([3, np.nan], 100),\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data)\n # Assert that columns \"a\" and \"b\" are in the heatmap and column \"c\" is not\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertNotIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n def test_case_6(self):\n # Input: Empty DataFrame.\n data = {}\n df = pd.DataFrame(data)\n ax = f_331(data)\n # Assert that the function returns None for an empty DataFrame\n self.assertIsNone(ax)\n def tearDown(self):\n 
plt.close(\"all\")", "apis": ["pandas.DataFrame", "seaborn.heatmap"], "libs": ["pandas", "seaborn"], "doc": {"description": ["Removes a column from a given data dictionary and creates a heatmap", "of the correlation matrix of the remaining data. Non-numeric columns are", "excluded from the heatmap. If the data is empty or has no numeric columns,", "the function returns None."], "note": [], "params": ["data: The input data dictionary.", "column (str): Name of column to remove. Defaults to \"c\"."], "returns": ["matplotlib.axes._subplots.AxesSubplot or None: The Axes object of the heatmap", "or None if the heatmap is not generated."], "reqs": ["pandas", "seaborn"], "raises": [], "example": [">>> f_331({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})", "", ">>> f_331(pd.DataFrame({'a': [\"foo\", \"bar\"]}))"]}} -{"task_id": "f_564", "prompt": "import itertools\nimport random\n\ndef f_564(t, n):\n \"\"\"\n Generate all combinations from a tuple with length n and return a random combination of length n.\n \n Parameters:\n - t (tuple): The tuple.\n - n (int): The length of the combinations.\n \n Returns:\n - tuple: A combination of the input tuple.\n\n Requirements:\n - itertools\n - random\n \n Example:\n >>> random.seed(42)\n >>> f_564((1, 2, 3, 4), 2)\n (3, 4)\n \"\"\"", "canonical_solution": " combinations = list(itertools.combinations(t, n))\n selected_combination = random.choice(combinations)\n\n return selected_combination", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n combination = f_564((1, 2, 3, 4), 2)\n self.assertTrue(tuple(sorted(combination)) in [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)])\n def test_case_2(self):\n combination = f_564((1, 2, 3, 4), 3)\n self.assertTrue(tuple(sorted(combination)) in [(1, 2, 3), (1, 2, 4), (1, 3, 4), (2, 3, 4)])\n def test_case_3(self):\n combination = f_564((1, 2, 3, 4), 4)\n self.assertTrue(tuple(sorted(combination)) in [(1, 2, 3, 4)])\n def test_case_4(self):\n combination 
= f_564((1, 2, 3, 4), 1)\n self.assertTrue(tuple(sorted(combination)) in [(1,), (2,), (3,), (4,)])\n def test_case_5(self):\n combination = f_564((1, 2, 3, 4), 0)\n self.assertTrue(tuple(sorted(combination)) in [()])", "apis": ["itertools.combinations", "random.choice"], "libs": ["random", "itertools"], "doc": {"description": ["Generate all combinations from a tuple with length n and return a random combination of length n."], "note": [], "params": ["t (tuple): The tuple.", "n (int): The length of the combinations."], "returns": ["tuple: A combination of the input tuple."], "reqs": ["itertools", "random"], "raises": [], "example": [">>> random.seed(42)", ">>> f_564((1, 2, 3, 4), 2)", "(3, 4)"]}} +{"task_id": "f_363", "prompt": "import subprocess\nimport psutil\nimport time\nimport os\n\n\ndef f_363(script_path: str, timeout=10) -> dict:\n \"\"\"\n Executes a given bash script and returns the CPU and memory usage of the script's process.\n\n This function checks whether the script path exists, then it executes it in a subprocess\n and uses psutil to monitor the script's process for CPU and memory usage.\n Note:\n - CPU usage is a cumulative measure of the script process's CPU demand over the execution\n period, not an average across cores.\n - Memory usage is reported as the sum of RSS memory increments.\n The function aggregates these metrics until the script completes or the specified timeout is\n reached. It handles cases where the process becomes a zombie or is not found, and ensures the\n subprocess is terminated if it runs beyond the timeout.\n\n Parameters:\n script_path (str): The path to the bash script to be executed. 
Path must exist.\n timeout (int, optional): Maximum time (in seconds) the function should wait for the script to complete.\n Defaults to 10 seconds.\n\n Returns:\n dict: A dictionary containing:\n - 'CPU Usage': The accumulated CPU usage in percentage.\n - 'Memory Usage': The accumulated memory usage in bytes.\n\n Requirements:\n - subprocess\n - psutil\n - time\n - os\n \n Examples:\n >>> resources = f_363('/path/to/script.sh')\n >>> resources\n {'CPU Usage': 5.2, 'Memory Usage': 2048}\n \"\"\"", "canonical_solution": " if not os.path.exists(script_path):\n raise FileNotFoundError(f\"'{script_path}' does not exist.\")\n\n # Start the bash script process\n p = subprocess.Popen([\"bash\", script_path])\n pid = p.pid\n\n # Initialize resources\n total_cpu = 0.0\n total_memory = 0\n\n start_time = time.time()\n\n try:\n # Fetch the process using psutil\n process = psutil.Process(pid)\n\n # Continuously fetch the process statistics\n while process.is_running():\n # Get the CPU and memory usage\n cpu_percent = process.cpu_percent(interval=0.05)\n total_cpu += cpu_percent\n total_memory += process.memory_info().rss\n time.sleep(0.05)\n\n # Check for timeout\n if time.time() - start_time > timeout:\n break\n except (psutil.NoSuchProcess, psutil.ZombieProcess):\n pass\n finally:\n if p.poll() is None:\n p.terminate()\n p.wait()\n\n return {\"CPU Usage\": total_cpu, \"Memory Usage\": total_memory}", "test": "import unittest\nimport os\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.temp_path = self.temp_dir.name\n # Create scripts for testing\n self.script_path_1 = os.path.join(self.temp_path, \"script.sh\")\n with open(self.script_path_1, \"w\") as script_file:\n os.chmod(self.script_path_1, 0o755)\n script_file.write(\"#!/bin/bash\\nsleep 5\")\n self.script_path_2 = os.path.join(self.temp_path, \"cpu_script.sh\")\n with open(self.script_path_2, \"w\") as script_file:\n 
os.chmod(self.script_path_2, 0o755)\n script_file.write(\n \"#!/bin/bash\\nfor i in {1..10000}\\ndo\\n echo $i > /dev/null\\ndone\"\n )\n def tearDown(self):\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test returned data structure\n resources = f_363(self.script_path_1)\n self.assertIn(\"CPU Usage\", resources)\n self.assertIn(\"Memory Usage\", resources)\n def test_case_2(self):\n # Test returned data type\n resources = f_363(self.script_path_1)\n self.assertIsInstance(resources[\"CPU Usage\"], float)\n self.assertIsInstance(resources[\"Memory Usage\"], int)\n def test_case_3(self):\n # Testing with a non-existent script\n with self.assertRaises(FileNotFoundError):\n f_363(\"non_existent_script.sh\")\n def test_case_4(self):\n # Check if CPU Usage is accumulated correctly\n resources = f_363(self.script_path_2)\n self.assertGreater(resources[\"CPU Usage\"], 0)\n def test_case_5(self):\n # Check if Memory Usage is accumulated correctly\n resources = f_363(self.script_path_2)\n self.assertGreaterEqual(resources[\"Memory Usage\"], 0)\n def test_case_6(self):\n # Test with a script and a high timeout value\n resources = f_363(self.script_path_1, timeout=100)\n self.assertTrue(isinstance(resources, dict))\n def test_case_7(self):\n # Test function behavior with zero timeout\n resources = f_363(self.script_path_1, timeout=0)\n self.assertTrue(isinstance(resources, dict))\n def test_case_8(self):\n # Test with a script that requires input\n script_path = os.path.join(self.temp_path, \"input_script.sh\")\n with open(script_path, \"w\") as script_file:\n os.chmod(script_path, 0o755)\n script_file.write(\"#!/bin/bash\\nread varName\")\n resources = f_363(script_path, timeout=5)\n self.assertTrue(isinstance(resources, dict))\n def test_case_9(self):\n # Test with an invalid script path\n with self.assertRaises(FileNotFoundError):\n f_363(os.path.join(self.temp_path, \"/invalid/path/\\0/script.sh\"))\n def test_case_10(self):\n # Test with a script that terminates 
early\n script_path = os.path.join(self.temp_path, \"terminate_script.sh\")\n with open(script_path, \"w\") as script_file:\n os.chmod(script_path, 0o755)\n script_file.write(\"#!/bin/bash\\nexit 1\")\n resources = f_363(script_path)\n self.assertTrue(isinstance(resources, dict))", "apis": ["psutil.Process", "os.path", "psutil.NoSuchProcess", "time.time", "time.sleep", "psutil.ZombieProcess", "os.path.exists", "subprocess.Popen"], "libs": ["subprocess", "os", "time", "psutil"], "doc": {"description": ["Executes a given bash script and returns the CPU and memory usage of the script's process.", "This function checks whether the script path exists, then it executes it in a subprocess", "and uses psutil to monitor the script's process for CPU and memory usage."], "note": ["CPU usage is a cumulative measure of the script process's CPU demand over the execution", "period, not an average across cores.", "Memory usage is reported as the sum of RSS memory increments.", "The function aggregates these metrics until the script completes or the specified timeout is", "reached. It handles cases where the process becomes a zombie or is not found, and ensures the", "subprocess is terminated if it runs beyond the timeout."], "params": ["script_path (str): The path to the bash script to be executed. 
Path must exist.", "timeout (int, optional): Maximum time (in seconds) the function should wait for the script to complete.", "Defaults to 10 seconds."], "returns": ["dict: A dictionary containing:", "'CPU Usage': The accumulated CPU usage in percentage.", "'Memory Usage': The accumulated memory usage in bytes."], "reqs": ["subprocess", "psutil", "time", "os"], "raises": [], "example": ["Examples:", ">>> resources = f_363('/path/to/script.sh')", ">>> resources", "{'CPU Usage': 5.2, 'Memory Usage': 2048}"]}} +{"task_id": "f_856", "prompt": "import requests\nfrom urllib.parse import urljoin\nfrom bs4 import BeautifulSoup\nimport csv\n\n\ndef f_856(\n url: str,\n base_url: str = \"https://www.example.com\",\n csv_file: str = \"scraped_data.csv\",\n) -> int:\n \"\"\"\n This function scrapes a webpage for all hyperlinks and saves them as absolute URLs to a CSV file.\n\n Parameters:\n - url (str): The relative URL of the webpage to scrape.\n - base_url (str, optional): The base URL of the website to prepend to relative links. Defaults to 'https://www.example.com'.\n - csv_file (str, optional): The filename for the CSV file where the links will be saved. 
Defaults to 'scraped_data.csv'.\n\n Returns:\n - int: The number of unique absolute links scraped from the webpage.\n\n Requirements:\n - requests\n - urllib.parse.urljoin\n - bs4.BeautifulSoup\n - csv\n\n Examples:\n >>> f_856('/mywebpage')\n 5\n >>> f_856('/anotherpage', base_url='https://www.different.com', csv_file='other_links.csv')\n 8\n \"\"\"", "canonical_solution": " full_url = urljoin(base_url, url)\n response = requests.get(full_url)\n soup = BeautifulSoup(response.text, \"html.parser\")\n\n # Extract and convert all found links to absolute URLs\n links = {urljoin(base_url, a[\"href\"]) for a in soup.find_all(\"a\", href=True)}\n\n with open(csv_file, \"w\", newline=\"\", encoding=\"utf-8\") as csvfile:\n writer = csv.writer(csvfile)\n for link in links:\n writer.writerow([link])\n\n return len(links)", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport requests\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_856.\"\"\"\n @patch(\"requests.get\")\n def test_empty_page(self, mock_get):\n \"\"\"\n Test the function with an empty webpage (no links).\n \"\"\"\n mock_get.return_value = MagicMock(text=\"\")\n result = f_856(\"/empty\")\n self.assertEqual(result, 0)\n @patch(\"requests.get\")\n def test_single_link(self, mock_get):\n \"\"\"\n Test the function with a webpage containing a single link.\n \"\"\"\n mock_get.return_value = MagicMock(\n text='Link1'\n )\n result = f_856(\"/single-link\")\n self.assertEqual(result, 1)\n @patch(\"requests.get\")\n def test_multiple_links(self, mock_get):\n \"\"\"\n Test the function with a webpage containing multiple distinct links.\n \"\"\"\n mock_get.return_value = MagicMock(\n text='Link1Link2'\n )\n result = f_856(\"/multiple-links\")\n self.assertEqual(result, 2)\n @patch(\"requests.get\")\n def test_duplicate_links(self, mock_get):\n \"\"\"\n Test the function with a webpage containing duplicate links.\n \"\"\"\n mock_get.return_value = MagicMock(\n 
text='LinkLink'\n )\n result = f_856(\"/duplicate-links\")\n self.assertEqual(result, 1)\n @patch(\"requests.get\")\n def test_external_links(self, mock_get):\n \"\"\"\n Test the function with a webpage containing external links.\n \"\"\"\n mock_get.return_value = MagicMock(\n text='External Link'\n )\n result = f_856(\"/external-link\")\n self.assertEqual(result, 1)\n @classmethod\n def tearDownClass(cls):\n \"\"\"Remove the database file with retries.\"\"\"\n if os.path.exists(\"scraped_data.csv\"):\n os.remove(\"scraped_data.csv\")", "apis": ["csv.writer", "urllib.parse.urljoin", "bs4.BeautifulSoup", "requests.get"], "libs": ["bs4", "urllib", "csv", "requests"], "doc": {"description": ["This function scrapes a webpage for all hyperlinks and saves them as absolute URLs to a CSV file."], "note": [], "params": ["url (str): The relative URL of the webpage to scrape.", "base_url (str, optional): The base URL of the website to prepend to relative links. Defaults to 'https://www.example.com'.", "csv_file (str, optional): The filename for the CSV file where the links will be saved. 
Defaults to 'scraped_data.csv'."], "returns": ["int: The number of unique absolute links scraped from the webpage."], "reqs": ["requests", "urllib.parse.urljoin", "bs4.BeautifulSoup", "csv"], "raises": [], "example": ["Examples:", ">>> f_856('/mywebpage')", "5", ">>> f_856('/anotherpage', base_url='https://www.different.com', csv_file='other_links.csv')", "8"]}} +{"task_id": "f_784", "prompt": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_784(start_date: str, periods: int, freq: str, random_seed: int = 0) -> (pd.DataFrame, plt.Axes):\n \"\"\"\n Generates and plots a sales forecast starting from a given date, for a specified number of periods and frequency.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n \n Parameters:\n - start_date (str): Start date for the forecast in 'YYYY-MM-DD' format.\n - periods (int): Number of periods to forecast.\n - freq (str): Frequency of the forecast (e.g., 'WOM-2FRI' for the second Friday of each month, 'M' for monthly).\n - random_seed (int, optional): Seed for the random number generator to ensure reproducibility.\n\n Returns:\n - A tuple containing:\n 1. A DataFrame with columns ['Date', 'Sales'], where 'Date' is the forecast date and 'Sales' are the forecasted sales.\n 2. 
A matplotlib Axes object for the sales forecast plot.\n\n Examples:\n >>> df, ax = f_784('2021-01-01', 5, 'WOM-2FRI')\n >>> print(df)\n Sales\n Date \n 2021-01-08 272\n 2021-02-12 147\n 2021-03-12 217\n 2021-04-09 292\n 2021-05-14 423\n >>> df, ax = f_784('2022-02-01', 3, 'M', random_seed=42)\n >>> print(df)\n Sales\n Date \n 2022-02-28 202\n 2022-03-31 448\n 2022-04-30 370\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n date_range = pd.date_range(start_date, periods=periods, freq=freq)\n sales_forecast = np.random.randint(100, 500, size=periods)\n forecast_df = pd.DataFrame({'Date': date_range, 'Sales': sales_forecast}).set_index('Date')\n\n fig, ax = plt.subplots()\n forecast_df['Sales'].plot(ax=ax, marker='o')\n ax.set_title('Sales Forecast')\n ax.set_xlabel('Date')\n ax.set_ylabel('Sales')\n ax.grid(True)\n \n return forecast_df, ax", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def setUp(self):\n self.random_seed = 42\n def test_basic_forecast(self):\n df, ax = f_784('2021-01-01', 5, 'WOM-2FRI', self.random_seed)\n self.assertEqual(len(df), 5)\n self.assertTrue(all(df.columns == ['Sales']))\n self.assertEqual(ax.get_title(), 'Sales Forecast')\n def test_monthly_forecast(self):\n df, ax = f_784('2022-01-01', 3, 'M', self.random_seed)\n self.assertEqual(len(df), 3)\n self.assertTrue(all(df.columns == ['Sales']))\n def test_quarterly_forecast(self):\n df, ax = f_784('2020-01-01', 4, 'Q', self.random_seed)\n self.assertEqual(len(df), 4)\n self.assertTrue(all(df.columns == ['Sales']))\n def test_invalid_input(self):\n with self.assertRaises(ValueError):\n f_784('2021-13-01', 5, 'M', self.random_seed)\n def test_negative_periods(self):\n with self.assertRaises(ValueError):\n f_784('2021-01-01', -5, 'M', self.random_seed)", "apis": ["numpy.random.randint", "pandas.DataFrame", "numpy.random", "pandas.date_range", "matplotlib.pyplot.subplots", "numpy.random.seed", "matplotlib.pyplot.Axes"], "libs": ["pandas", "numpy", 
"matplotlib"], "doc": {"description": ["Generates and plots a sales forecast starting from a given date, for a specified number of periods and frequency."], "note": [], "params": ["start_date (str): Start date for the forecast in 'YYYY-MM-DD' format.", "periods (int): Number of periods to forecast.", "freq (str): Frequency of the forecast (e.g., 'WOM-2FRI' for the second Friday of each month, 'M' for monthly).", "random_seed (int, optional): Seed for the random number generator to ensure reproducibility."], "returns": ["A tuple containing:", "1. A DataFrame with columns ['Date', 'Sales'], where 'Date' is the forecast date and 'Sales' are the forecasted sales.", "2. A matplotlib Axes object for the sales forecast plot."], "reqs": ["pandas", "numpy", "matplotlib.pyplot"], "raises": [], "example": ["Examples:", ">>> df, ax = f_784('2021-01-01', 5, 'WOM-2FRI')", ">>> print(df)", "Sales", "Date", "2021-01-08 272", "2021-02-12 147", "2021-03-12 217", "2021-04-09 292", "2021-05-14 423", ">>> df, ax = f_784('2022-02-01', 3, 'M', random_seed=42)", ">>> print(df)", "Sales", "Date", "2022-02-28 202", "2022-03-31 448", "2022-04-30 370"]}} +{"task_id": "f_331", "prompt": "import pandas as pd\nimport seaborn as sns\n\n\ndef f_331(data, column=\"c\"):\n \"\"\"\n Removes a column from a given data dictionary and creates a heatmap\n of the correlation matrix of the remaining data. Non-numeric columns are\n excluded from the heatmap. If the data is empty or has no numeric columns,\n the function returns None.\n\n Parameters:\n - data: The input data dictionary.\n - column (str): Name of column to remove. 
Defaults to \"c\".\n\n Returns:\n - matplotlib.axes._subplots.Axes or None: The Axes object of the heatmap\n or None if the heatmap is not generated.\n\n Requirements:\n - pandas\n - seaborn\n\n Example:\n >>> f_331({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n \n >>> f_331(pd.DataFrame({'a': [\"foo\", \"bar\"]}))\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n if column in df.columns:\n df = df.drop(columns=column)\n\n df = df.select_dtypes(include=[\"number\"])\n\n if df.empty:\n return None\n\n return sns.heatmap(df.corr())", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n def _assert_heatmap_matches_corr(self, ax, corr):\n # Helper function to assert that the heatmap matches the correlation matrix\n heatmap_data = ax.collections[0].get_array().data\n np.testing.assert_array_almost_equal(\n heatmap_data, corr.values.flatten(), decimal=2\n )\n def test_case_1(self):\n # Input: DataFrame with column \"c\".\n data = {\n \"a\": list(range(10)),\n \"b\": list(range(10)),\n \"c\": list(range(10)),\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data)\n # Assert that column \"c\" is not in the heatmap\n self.assertNotIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n # Check plotted value correctness\n self._assert_heatmap_matches_corr(ax, df.drop(columns=[\"c\"]).corr())\n def test_case_2(self):\n # Input: DataFrame without column \"c\".\n data = {\"a\": list(range(10)), \"b\": list(range(10))}\n df = pd.DataFrame(data)\n ax = f_331(data)\n # Assert that columns \"a\" and \"b\" are in the heatmap\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n # Check plotted value correctness\n self._assert_heatmap_matches_corr(ax, df.corr())\n def test_case_3(self):\n # Input: DataFrame with column \"c\", but we specify another column to remove\n data = {\n 
\"a\": list(range(10)),\n \"b\": list(range(10)),\n \"c\": list(range(10)),\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data, column=\"b\")\n # Assert that column \"b\" is not in the heatmap\n self.assertNotIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n # Assert that other columns are in the heatmap\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n # Check plotted value correctness\n self._assert_heatmap_matches_corr(ax, df.drop(columns=[\"b\"]).corr())\n def test_case_4(self):\n # Input: DataFrame with non-numeric columns and column \"c\".\n data = {\n \"a\": list(range(4)),\n \"b\": [\"low\", \"medium\", \"high\", \"medium\"],\n \"c\": [\"apple\", \"banana\", \"cherry\", \"dates\"],\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data)\n # Assert that only numeric column \"a\" is in the heatmap\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertNotIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertNotIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n def test_case_5(self):\n # Input: DataFrame with missing values and column \"c\".\n np.random.seed(0)\n data = {\n \"a\": np.random.choice([1, np.nan], 100),\n \"b\": np.random.choice([2, np.nan], 100),\n \"c\": np.random.choice([3, np.nan], 100),\n }\n df = pd.DataFrame(\n data\n )\n ax = f_331(data)\n # Assert that columns \"a\" and \"b\" are in the heatmap and column \"c\" is not\n self.assertIn(\"a\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertIn(\"b\", [col.get_text() for col in ax.get_xticklabels()])\n self.assertNotIn(\"c\", [col.get_text() for col in ax.get_xticklabels()])\n def test_case_6(self):\n # Input: Empty DataFrame.\n data = {}\n df = pd.DataFrame(data)\n ax = f_331(data)\n # Assert that the function returns None for an empty DataFrame\n self.assertIsNone(ax)\n def tearDown(self):\n 
plt.close(\"all\")", "apis": ["pandas.DataFrame", "seaborn.heatmap"], "libs": ["seaborn", "pandas"], "doc": {"description": ["Removes a column from a given data dictionary and creates a heatmap", "of the correlation matrix of the remaining data. Non-numeric columns are", "excluded from the heatmap. If the data is empty or has no numeric columns,", "the function returns None."], "note": [], "params": ["data: The input data dictionary.", "column (str): Name of column to remove. Defaults to \"c\"."], "returns": ["matplotlib.axes._subplots.Axes or None: The Axes object of the heatmap", "or None if the heatmap is not generated."], "reqs": ["pandas", "seaborn"], "raises": [], "example": [">>> f_331({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})", "", ">>> f_331(pd.DataFrame({'a': [\"foo\", \"bar\"]}))"]}} +{"task_id": "f_564", "prompt": "import itertools\nimport random\n\ndef f_564(t, n):\n \"\"\"\n Generate all combinations from a tuple with length n and return a random combination of length n.\n \n Parameters:\n - t (tuple): The tuple.\n - n (int): The length of the combinations.\n \n Returns:\n - tuple: A combination of the input tuple.\n\n Requirements:\n - itertools\n - random\n \n Example:\n >>> random.seed(42)\n >>> f_564((1, 2, 3, 4), 2)\n (3, 4)\n \"\"\"", "canonical_solution": " combinations = list(itertools.combinations(t, n))\n selected_combination = random.choice(combinations)\n\n return selected_combination", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n combination = f_564((1, 2, 3, 4), 2)\n self.assertTrue(tuple(sorted(combination)) in [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)])\n def test_case_2(self):\n combination = f_564((1, 2, 3, 4), 3)\n self.assertTrue(tuple(sorted(combination)) in [(1, 2, 3), (1, 2, 4), (1, 3, 4), (2, 3, 4)])\n def test_case_3(self):\n combination = f_564((1, 2, 3, 4), 4)\n self.assertTrue(tuple(sorted(combination)) in [(1, 2, 3, 4)])\n def test_case_4(self):\n combination = 
f_564((1, 2, 3, 4), 1)\n self.assertTrue(tuple(sorted(combination)) in [(1,), (2,), (3,), (4,)])\n def test_case_5(self):\n combination = f_564((1, 2, 3, 4), 0)\n self.assertTrue(tuple(sorted(combination)) in [()])", "apis": ["itertools.combinations", "random.choice"], "libs": ["itertools", "random"], "doc": {"description": ["Generate all combinations from a tuple with length n and return a random combination of length n."], "note": [], "params": ["t (tuple): The tuple.", "n (int): The length of the combinations."], "returns": ["tuple: A combination of the input tuple."], "reqs": ["itertools", "random"], "raises": [], "example": [">>> random.seed(42)", ">>> f_564((1, 2, 3, 4), 2)", "(3, 4)"]}} {"task_id": "f_775", "prompt": "import string\nimport wordninja\n\ndef f_775(word):\n \"\"\"\n Converts a word into a list of tuples, with each tuple containing a lowercase English letter from the word and its position in the alphabet.\n Then, split the given word into a list of words.\n \n Requirements:\n - string\n - wordninja\n \n Parameters:\n - word (str): A string composed of lowercase letters.\n \n Returns:\n - list of tuples: Each tuple consists of a letter from the input string and its corresponding position in the alphabet.\n \n Examples:\n >>> f_775('abc')\n ([('a', 1), ('b', 2), ('c', 3)], ['abc'])\n >>> f_775('howistheweathertoday')\n ([('h', 8), ('o', 15), ('w', 23), ('i', 9), ('s', 19), ('t', 20), ('h', 8), ('e', 5), ('w', 23), ('e', 5), ('a', 1), ('t', 20), ('h', 8), ('e', 5), ('r', 18), ('t', 20), ('o', 15), ('d', 4), ('a', 1), ('y', 25)], ['how', 'is', 'the', 'weather', 'today'])\n \"\"\"", "canonical_solution": " ALPHABET = list(string.ascii_lowercase)\n # Map each letter in the word to its corresponding alphabetical number\n word_numbers = [ALPHABET.index(letter) + 1 for letter in word]\n \n # Combine each letter with its alphabetical number in a tuple\n return [(word[i], word_numbers[i]) for i in range(len(word))], wordninja.split(word)", "test": "import 
unittest\nclass TestCases(unittest.TestCase):\n \n def test_basic_word(self):\n self.assertEqual(f_775('abc'), ([('a', 1), ('b', 2), ('c', 3)], ['abc']))\n \n def test_non_consecutive_letters(self):\n self.assertEqual(f_775('ihatehim'), ([('i', 9), ('h', 8), ('a', 1), ('t', 20), ('e', 5), ('h', 8), ('i', 9), ('m', 13)], ['i', 'hate', 'him']))\n \n def test_single_letter(self):\n self.assertEqual(f_775('hellohello'), ([('h', 8), ('e', 5), ('l', 12), ('l', 12), ('o', 15), ('h', 8), ('e', 5), ('l', 12), ('l', 12), ('o', 15)], ['hello', 'hello']))\n \n def test_repeated_letters(self):\n self.assertEqual(f_775('aa'), ([('a', 1), ('a', 1)], ['a', 'a']))\n \n def test_empty_string(self):\n self.assertEqual(f_775(''), ([], []))\n \n def test_long_word(self):\n result = f_775('abcdefghijklmnopqrstuvwxyz')\n ALPHABET = list(string.ascii_lowercase)\n expected = [(letter, index + 1) for index, letter in enumerate(ALPHABET)]\n self.assertEqual(result, (expected, ['abcde', 'fg', 'hi', 'j', 'klm', 'no', 'p', 'qrs', 'tu', 'vw', 'xyz']))\n \n def test_word_with_uppercase_should_fail(self):\n with self.assertRaises(ValueError):\n f_775('aBc')", "apis": ["wordninja.split", "string.ascii_lowercase"], "libs": ["string", "wordninja"], "doc": {"description": ["Converts a word into a list of tuples, with each tuple containing a lowercase English letter from the word and its position in the alphabet.", "Then, split the given word into a list of words."], "note": [], "params": ["word (str): A string composed of lowercase letters."], "returns": ["list of tuples: Each tuple consists of a letter from the input string and its corresponding position in the alphabet."], "reqs": ["string", "wordninja"], "raises": [], "example": ["Examples:", ">>> f_775('abc')", "([('a', 1), ('b', 2), ('c', 3)], ['abc'])", ">>> f_775('howistheweathertoday')", "([('h', 8), ('o', 15), ('w', 23), ('i', 9), ('s', 19), ('t', 20), ('h', 8), ('e', 5), ('w', 23), ('e', 5), ('a', 1), ('t', 20), ('h', 8), ('e', 5), ('r', 
18), ('t', 20), ('o', 15), ('d', 4), ('a', 1), ('y', 25)], ['how', 'is', 'the', 'weather', 'today'])"]}} -{"task_id": "f_801", "prompt": "import random\nimport re\n\n\ndef f_801(text, seed=None):\n \"\"\"\n Scramble the letters in each word of a given text, keeping the first and last letters of each word intact.\n\n Parameters:\n text (str): The text to be scrambled.\n seed (int, optional): A seed for the random number generator to ensure reproducible results.\n Defaults to None (not set).\n\n Returns:\n str: The scrambled text.\n\n Requirements:\n - random\n - re\n\n Notes:\n - Words are determined by regex word boundaries.\n - The scrambling only affects words longer than three characters, leaving shorter words unchanged.\n\n Examples:\n >>> f_801('Hello, world!', 0)\n 'Hello, wlrod!'\n >>> f_801(\"Programming is fun, isn't it?\", 42)\n \"Prmiangmrog is fun, isn't it?\"\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n def scramble_word(match):\n word = match.group(0)\n if len(word) > 3:\n middle = list(word[1:-1])\n random.shuffle(middle)\n return word[0] + \"\".join(middle) + word[-1]\n else:\n return word\n\n pattern = r\"\\b\\w+\\b\"\n scrambled_text = re.sub(pattern, scramble_word, text)\n\n return scrambled_text", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with a simple sentence\n input_text = \"Hello world\"\n output_text = f_801(input_text, seed=1)\n self.assertTrue(output_text.startswith(\"H\"))\n self.assertTrue(output_text.endswith(\"d\"))\n self.assertEqual(len(input_text.split()), len(output_text.split()))\n def test_case_2(self):\n # Test with single word\n input_text = \"Programming\"\n output_text = f_801(input_text, seed=2)\n self.assertTrue(output_text.startswith(\"P\"))\n self.assertTrue(output_text.endswith(\"g\"))\n self.assertEqual(len(input_text), len(output_text))\n def test_case_3(self):\n # Test with a sentence having punctuation\n input_text = \"Hello, 
world!\"\n output_text = f_801(input_text, seed=3)\n self.assertTrue(output_text.startswith(\"H\"))\n self.assertTrue(output_text.endswith(\"!\"))\n self.assertEqual(len(input_text.split()), len(output_text.split()))\n def test_case_4(self):\n # Test with a sentence having numbers\n input_text = \"I have 2 cats\"\n output_text = f_801(input_text, seed=4)\n self.assertTrue(output_text.startswith(\"I\"))\n self.assertTrue(output_text.endswith(\"s\"))\n self.assertTrue(\"2\" in output_text)\n self.assertEqual(len(input_text.split()), len(output_text.split()))\n def test_case_5(self):\n # Test with empty string\n input_text = \"\"\n output_text = f_801(input_text, seed=5)\n self.assertEqual(output_text, \"\")\n def test_case_6(self):\n # Test with words containing digits and special characters\n input_text = \"Python3 is fun!\"\n output_text = f_801(input_text, seed=6)\n self.assertTrue(output_text.startswith(\"P\") and output_text.endswith(\"!\"))\n self.assertIn(\"3\", output_text)\n def test_case_7(self):\n # Test words that are 3 characters long\n input_text = \"Can you see the cat?\"\n output_text = f_801(input_text, seed=8)\n self.assertIn(\"Can\", output_text)\n self.assertIn(\"the\", output_text)\n self.assertIn(\"cat\", output_text)\n def test_case_8(self):\n # Test with a longer paragraph\n input_text = (\n \"This is a longer text to see how the function handles more complex inputs.\"\n )\n output_text = f_801(input_text, seed=9)\n self.assertGreaterEqual(\n len(output_text.split()), 10\n ) # Ensure it's a long input\n def test_case_9(self):\n # Test with non-English characters\n input_text = \"\u041f\u0440\u0438\u0432\u0435\u0442, \u043a\u0430\u043a \u0434\u0435\u043b\u0430?\"\n output_text = f_801(input_text, seed=10)\n self.assertTrue(output_text.startswith(\"\u041f\") and output_text.endswith(\"?\"))\n def test_case_10(self):\n # Test reproducibility with the same seed\n input_text = \"Reproducibility test\"\n output_text1 = f_801(input_text, seed=11)\n 
output_text2 = f_801(input_text, seed=11)\n self.assertEqual(output_text1, output_text2)", "apis": ["random.shuffle", "re.sub", "random.seed"], "libs": ["re", "random"], "doc": {"description": ["Scramble the letters in each word of a given text, keeping the first and last letters of each word intact.", "Notes:", "- Words are determined by regex word boundaries.", "- The scrambling only affects words longer than three characters, leaving shorter words unchanged."], "note": [], "params": ["text (str): The text to be scrambled.", "seed (int, optional): A seed for the random number generator to ensure reproducible results.", "Defaults to None (not set)."], "returns": ["str: The scrambled text."], "reqs": ["random", "re"], "raises": [], "example": ["Examples:", ">>> f_801('Hello, world!', 0)", "'Hello, wlrod!'", ">>> f_801(\"Programming is fun, isn't it?\", 42)", "\"Prmiangmrog is fun, isn't it?\""]}} +{"task_id": "f_801", "prompt": "import random\nimport re\n\n\ndef f_801(text, seed=None):\n \"\"\"\n Scramble the letters in each word of a given text, keeping the first and last letters of each word intact.\n\n Parameters:\n text (str): The text to be scrambled.\n seed (int, optional): A seed for the random number generator to ensure reproducible results.\n Defaults to None (not set).\n\n Returns:\n str: The scrambled text.\n\n Requirements:\n - random\n - re\n\n Notes:\n - Words are determined by regex word boundaries.\n - The scrambling only affects words longer than three characters, leaving shorter words unchanged.\n\n Examples:\n >>> f_801('Hello, world!', 0)\n 'Hello, wlrod!'\n >>> f_801(\"Programming is fun, isn't it?\", 42)\n \"Prmiangmrog is fun, isn't it?\"\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n def scramble_word(match):\n word = match.group(0)\n if len(word) > 3:\n middle = list(word[1:-1])\n random.shuffle(middle)\n return word[0] + \"\".join(middle) + word[-1]\n else:\n return word\n\n pattern = r\"\\b\\w+\\b\"\n 
scrambled_text = re.sub(pattern, scramble_word, text)\n\n return scrambled_text", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with a simple sentence\n input_text = \"Hello world\"\n output_text = f_801(input_text, seed=1)\n self.assertTrue(output_text.startswith(\"H\"))\n self.assertTrue(output_text.endswith(\"d\"))\n self.assertEqual(len(input_text.split()), len(output_text.split()))\n def test_case_2(self):\n # Test with single word\n input_text = \"Programming\"\n output_text = f_801(input_text, seed=2)\n self.assertTrue(output_text.startswith(\"P\"))\n self.assertTrue(output_text.endswith(\"g\"))\n self.assertEqual(len(input_text), len(output_text))\n def test_case_3(self):\n # Test with a sentence having punctuation\n input_text = \"Hello, world!\"\n output_text = f_801(input_text, seed=3)\n self.assertTrue(output_text.startswith(\"H\"))\n self.assertTrue(output_text.endswith(\"!\"))\n self.assertEqual(len(input_text.split()), len(output_text.split()))\n def test_case_4(self):\n # Test with a sentence having numbers\n input_text = \"I have 2 cats\"\n output_text = f_801(input_text, seed=4)\n self.assertTrue(output_text.startswith(\"I\"))\n self.assertTrue(output_text.endswith(\"s\"))\n self.assertTrue(\"2\" in output_text)\n self.assertEqual(len(input_text.split()), len(output_text.split()))\n def test_case_5(self):\n # Test with empty string\n input_text = \"\"\n output_text = f_801(input_text, seed=5)\n self.assertEqual(output_text, \"\")\n def test_case_6(self):\n # Test with words containing digits and special characters\n input_text = \"Python3 is fun!\"\n output_text = f_801(input_text, seed=6)\n self.assertTrue(output_text.startswith(\"P\") and output_text.endswith(\"!\"))\n self.assertIn(\"3\", output_text)\n def test_case_7(self):\n # Test words that are 3 characters long\n input_text = \"Can you see the cat?\"\n output_text = f_801(input_text, seed=8)\n self.assertIn(\"Can\", output_text)\n 
self.assertIn(\"the\", output_text)\n self.assertIn(\"cat\", output_text)\n def test_case_8(self):\n # Test with a longer paragraph\n input_text = (\n \"This is a longer text to see how the function handles more complex inputs.\"\n )\n output_text = f_801(input_text, seed=9)\n self.assertGreaterEqual(\n len(output_text.split()), 10\n ) # Ensure it's a long input\n def test_case_9(self):\n # Test with non-English characters\n input_text = \"\u041f\u0440\u0438\u0432\u0435\u0442, \u043a\u0430\u043a \u0434\u0435\u043b\u0430?\"\n output_text = f_801(input_text, seed=10)\n self.assertTrue(output_text.startswith(\"\u041f\") and output_text.endswith(\"?\"))\n def test_case_10(self):\n # Test reproducibility with the same seed\n input_text = \"Reproducibility test\"\n output_text1 = f_801(input_text, seed=11)\n output_text2 = f_801(input_text, seed=11)\n self.assertEqual(output_text1, output_text2)", "apis": ["re.sub", "random.seed", "random.shuffle"], "libs": ["re", "random"], "doc": {"description": ["Scramble the letters in each word of a given text, keeping the first and last letters of each word intact.", "Notes:", "- Words are determined by regex word boundaries.", "- The scrambling only affects words longer than three characters, leaving shorter words unchanged."], "note": [], "params": ["text (str): The text to be scrambled.", "seed (int, optional): A seed for the random number generator to ensure reproducible results.", "Defaults to None (not set)."], "returns": ["str: The scrambled text."], "reqs": ["random", "re"], "raises": [], "example": ["Examples:", ">>> f_801('Hello, world!', 0)", "'Hello, wlrod!'", ">>> f_801(\"Programming is fun, isn't it?\", 42)", "\"Prmiangmrog is fun, isn't it?\""]}} {"task_id": "f_760", "prompt": "import pandas as pd\nimport re\n\n# Function to replace acronyms in DataFrame\ndef f_760(df, mapping):\n \"\"\"\n Replace all acronyms in a DataFrame with their full words according to a provided dictionary.\n \n Requirements:\n - pandas\n - 
re\n\n Parameters:\n - df (pd.DataFrame): The input DataFrame where each cell is either a string or non-string type.\n - mapping (dict): A dictionary where keys are acronyms and values are the full words.\n \n Returns:\n - pd.DataFrame: A DataFrame where all acronyms in string cells have been replaced with their full words.\n \n Examples:\n >>> df = pd.DataFrame({'text': ['NASA is great', 'I live in the USA']})\n >>> mapping = {'NASA': 'National Aeronautics and Space Administration', 'USA': 'United States of America'}\n >>> print(f_760(df, mapping))\n text\n 0 National Aeronautics and Space Administration ...\n 1 I live in the United States of America\n \"\"\"", "canonical_solution": " pattern = re.compile(r'\\b[A-Z]+\\b')\n \n def replace_match(match):\n return mapping.get(match.group(0), match.group(0))\n\n df = df.applymap(lambda x: pattern.sub(replace_match, x) if isinstance(x, str) else x)\n\n return df", "test": "import unittest\n# Unit tests for the f_760 function\nclass ReplaceAcronymsTests(unittest.TestCase):\n def test_acronyms_single_column(self):\n df = pd.DataFrame({'text': ['NASA rocks', 'Visit the USA']})\n mapping = {'NASA': 'National Aeronautics and Space Administration', 'USA': 'United States of America'}\n expected = pd.DataFrame({'text': ['National Aeronautics and Space Administration rocks', 'Visit the United States of America']})\n result = f_760(df, mapping)\n pd.testing.assert_frame_equal(result, expected)\n \n def test_acronyms_multiple_columns(self):\n df = pd.DataFrame({'col1': ['NASA exploration'], 'col2': ['Made in USA']})\n mapping = {'NASA': 'National Aeronautics and Space Administration', 'USA': 'United States of America'}\n expected = pd.DataFrame({'col1': ['National Aeronautics and Space Administration exploration'], 'col2': ['Made in United States of America']})\n result = f_760(df, mapping)\n pd.testing.assert_frame_equal(result, expected)\n \n def test_no_acronyms(self):\n df = pd.DataFrame({'text': ['A sunny day', 'A rainy 
night']})\n mapping = {'NASA': 'National Aeronautics and Space Administration'}\n expected = pd.DataFrame({'text': ['A sunny day', 'A rainy night']})\n result = f_760(df, mapping)\n pd.testing.assert_frame_equal(result, expected)\n \n def test_non_string_types(self):\n df = pd.DataFrame({'text': ['NASA mission', 2020, None]})\n mapping = {'NASA': 'National Aeronautics and Space Administration'}\n expected = pd.DataFrame({'text': ['National Aeronautics and Space Administration mission', 2020, None]})\n result = f_760(df, mapping)\n pd.testing.assert_frame_equal(result, expected)\n \n def test_empty_dataframe(self):\n df = pd.DataFrame({'text': []})\n mapping = {'NASA': 'National Aeronautics and Space Administration'}\n expected = pd.DataFrame({'text': []})\n result = f_760(df, mapping)\n pd.testing.assert_frame_equal(result, expected)", "apis": ["re.compile"], "libs": ["re"], "doc": {"description": ["Replace all acronyms in a DataFrame with their full words according to a provided dictionary."], "note": [], "params": ["df (pd.DataFrame): The input DataFrame where each cell is either a string or non-string type.", "mapping (dict): A dictionary where keys are acronyms and values are the full words."], "returns": ["pd.DataFrame: A DataFrame where all acronyms in string cells have been replaced with their full words."], "reqs": ["pandas", "re"], "raises": [], "example": ["Examples:", ">>> df = pd.DataFrame({'text': ['NASA is great', 'I live in the USA']})", ">>> mapping = {'NASA': 'National Aeronautics and Space Administration', 'USA': 'United States of America'}", ">>> print(f_760(df, mapping))", "text", "0 National Aeronautics and Space Administration ...", "1 I live in the United States of America"]}} -{"task_id": "f_825", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\n\ndef f_825(df, column, bins=30, density=True, alpha=0.6, color=\"g\", seed=None):\n \"\"\"\n Plots a histogram for a specified column of a pandas 
DataFrame and overlays\n it with a fitted normal distribution curve.\n\n Parameters:\n - df (pandas.DataFrame): The input DataFrame.\n - column (str): The column name for which the histogram is plotted.\n - bins (int, optional): Number of bins for the histogram. Defaults to 30.\n - density (bool, optional): If True, the histogram is normalized to form a\n probability density. Defaults to True.\n - alpha (float, optional): Transparency level for the histogram bars.\n Defaults to 0.6.\n - color (str, optional): Color of the histogram bars. Defaults to 'g'.\n - seed (int, optional): Seed for the random number generator.\n Defaults to None (not set).\n\n Returns:\n - matplotlib.axes._axes.Axes: The matplotlib Axes object with the plot.\n\n Requirements:\n - numpy\n - matplotlib\n - scipy\n\n Example:\n >>> np.random.seed(0)\n >>> df = pd.DataFrame({'A': np.random.normal(0, 1, 1000)})\n >>> ax = f_825(df, 'A')\n >>> ax.get_title()\n \"Normal Fit for 'A'\"\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n data = df[column]\n mu, std = norm.fit(data)\n\n fig, ax = plt.subplots()\n ax.hist(data, bins=bins, density=density, alpha=alpha, color=color)\n\n xmin, xmax = plt.xlim()\n x = np.linspace(xmin, xmax, 100)\n p = norm.pdf(x, mu, std)\n ax.plot(x, p, \"k\", linewidth=2)\n\n title = f\"Normal Fit for '{column}'\"\n ax.set_title(title)\n ax.set_ylabel(\"Density\")\n ax.set_xlabel(column)\n\n return ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib import colors\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n np.random.seed(42)\n def test_data_correctness(self):\n \"\"\"Tests if the normal distribution parameters accurately represent the data's distribution.\"\"\"\n mean, std_dev = 0, 1\n df = pd.DataFrame({\"F\": np.random.normal(mean, std_dev, 5000)})\n ax = f_825(df, \"F\")\n line = ax.lines[\n 0\n ] # Assuming the normal distribution line 
is the first line object in the plot\n x_data = line.get_xdata()\n y_data = line.get_ydata()\n # The peak of the normal distribution curve should be at the mean\n estimated_mean = x_data[np.argmax(y_data)]\n self.assertAlmostEqual(\n estimated_mean,\n mean,\n places=1,\n msg=\"The calculated mean does not match the expected mean.\",\n )\n def test_bins_parameter(self):\n \"\"\"Verifies that changing the number of bins affects the plot.\"\"\"\n df = pd.DataFrame({\"B\": np.random.normal(0, 1, 100)})\n ax_default_bins = f_825(df, \"B\")\n ax_more_bins = f_825(df, \"B\", bins=50)\n self.assertNotEqual(\n ax_default_bins.patches,\n ax_more_bins.patches,\n \"Different 'bins' parameters should result in different histograms.\",\n )\n def test_alpha_parameter(self):\n \"\"\"Checks if the alpha parameter correctly sets the transparency.\"\"\"\n df = pd.DataFrame({\"C\": np.random.normal(0, 1, 100)})\n ax = f_825(df, \"C\", alpha=0.1)\n self.assertLess(\n ax.patches[0].get_alpha(),\n 0.5,\n \"The alpha parameter should control the transparency of histogram bars.\",\n )\n def test_density_parameter(self):\n \"\"\"Ensures the density parameter properly normalizes the histogram.\"\"\"\n df = pd.DataFrame({\"D\": np.random.normal(0, 1, 100)})\n ax = f_825(df, \"D\", density=False)\n total_bar_area = sum((p.get_width() * p.get_height() for p in ax.patches))\n self.assertNotEqual(\n total_bar_area,\n 1,\n \"With 'density=False', the histogram should not be normalized to form a probability density.\",\n )\n def test_color_parameter(self):\n \"\"\"Validates that the histogram bars use the specified color.\"\"\"\n df = pd.DataFrame({\"E\": np.random.normal(0, 1, 100)})\n ax = f_825(\n df, \"E\", color=\"blue\", alpha=0.6\n ) # Match alpha value with the function's default or specified value\n for patch in ax.patches:\n self.assertEqual(\n patch.get_facecolor(),\n colors.to_rgba(\"blue\", alpha=0.6),\n \"The bars should match the specified color.\",\n )\n def tearDown(self):\n 
plt.close(\"all\")", "apis": ["scipy.stats.norm.pdf", "numpy.random", "numpy.random.seed", "matplotlib.pyplot.xlim", "scipy.stats.norm.fit", "numpy.linspace", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Plots a histogram for a specified column of a pandas DataFrame and overlays", "it with a fitted normal distribution curve."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame.", "column (str): The column name for which the histogram is plotted.", "bins (int, optional): Number of bins for the histogram. Defaults to 30.", "density (bool, optional): If True, the histogram is normalized to form a", "probability density. Defaults to True.", "alpha (float, optional): Transparency level for the histogram bars.", "Defaults to 0.6.", "color (str, optional): Color of the histogram bars. Defaults to 'g'.", "seed (int, optional): Seed for the random number generator.", "Defaults to None (not set)."], "returns": ["matplotlib.axes._axes.Axes: The matplotlib Axes object with the plot."], "reqs": ["numpy", "matplotlib", "scipy"], "raises": [], "example": [">>> np.random.seed(0)", ">>> df = pd.DataFrame({'A': np.random.normal(0, 1, 1000)})", ">>> ax = f_825(df, 'A')", ">>> ax.get_title()", "\"Normal Fit for 'A'\""]}} +{"task_id": "f_825", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\n\ndef f_825(df, column, bins=30, density=True, alpha=0.6, color=\"g\", seed=None):\n \"\"\"\n Plots a histogram for a specified column of a pandas DataFrame and overlays\n it with a fitted normal distribution curve.\n\n Parameters:\n - df (pandas.DataFrame): The input DataFrame.\n - column (str): The column name for which the histogram is plotted.\n - bins (int, optional): Number of bins for the histogram. Defaults to 30.\n - density (bool, optional): If True, the histogram is normalized to form a\n probability density. 
Defaults to True.\n - alpha (float, optional): Transparency level for the histogram bars.\n Defaults to 0.6.\n - color (str, optional): Color of the histogram bars. Defaults to 'g'.\n - seed (int, optional): Seed for the random number generator.\n Defaults to None (not set).\n\n Returns:\n - matplotlib.axes._axes.Axes: The matplotlib Axes object with the plot.\n\n Requirements:\n - numpy\n - matplotlib\n - scipy\n\n Example:\n >>> np.random.seed(0)\n >>> df = pd.DataFrame({'A': np.random.normal(0, 1, 1000)})\n >>> ax = f_825(df, 'A')\n >>> ax.get_title()\n \"Normal Fit for 'A'\"\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n data = df[column]\n mu, std = norm.fit(data)\n\n fig, ax = plt.subplots()\n ax.hist(data, bins=bins, density=density, alpha=alpha, color=color)\n\n xmin, xmax = plt.xlim()\n x = np.linspace(xmin, xmax, 100)\n p = norm.pdf(x, mu, std)\n ax.plot(x, p, \"k\", linewidth=2)\n\n title = f\"Normal Fit for '{column}'\"\n ax.set_title(title)\n ax.set_ylabel(\"Density\")\n ax.set_xlabel(column)\n\n return ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib import colors\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n np.random.seed(42)\n def test_data_correctness(self):\n \"\"\"Tests if the normal distribution parameters accurately represent the data's distribution.\"\"\"\n mean, std_dev = 0, 1\n df = pd.DataFrame({\"F\": np.random.normal(mean, std_dev, 5000)})\n ax = f_825(df, \"F\")\n line = ax.lines[\n 0\n ] # Assuming the normal distribution line is the first line object in the plot\n x_data = line.get_xdata()\n y_data = line.get_ydata()\n # The peak of the normal distribution curve should be at the mean\n estimated_mean = x_data[np.argmax(y_data)]\n self.assertAlmostEqual(\n estimated_mean,\n mean,\n places=1,\n msg=\"The calculated mean does not match the expected mean.\",\n )\n def test_bins_parameter(self):\n 
\"\"\"Verifies that changing the number of bins affects the plot.\"\"\"\n df = pd.DataFrame({\"B\": np.random.normal(0, 1, 100)})\n ax_default_bins = f_825(df, \"B\")\n ax_more_bins = f_825(df, \"B\", bins=50)\n self.assertNotEqual(\n ax_default_bins.patches,\n ax_more_bins.patches,\n \"Different 'bins' parameters should result in different histograms.\",\n )\n def test_alpha_parameter(self):\n \"\"\"Checks if the alpha parameter correctly sets the transparency.\"\"\"\n df = pd.DataFrame({\"C\": np.random.normal(0, 1, 100)})\n ax = f_825(df, \"C\", alpha=0.1)\n self.assertLess(\n ax.patches[0].get_alpha(),\n 0.5,\n \"The alpha parameter should control the transparency of histogram bars.\",\n )\n def test_density_parameter(self):\n \"\"\"Ensures the density parameter properly normalizes the histogram.\"\"\"\n df = pd.DataFrame({\"D\": np.random.normal(0, 1, 100)})\n ax = f_825(df, \"D\", density=False)\n total_bar_area = sum((p.get_width() * p.get_height() for p in ax.patches))\n self.assertNotEqual(\n total_bar_area,\n 1,\n \"With 'density=False', the histogram should not be normalized to form a probability density.\",\n )\n def test_color_parameter(self):\n \"\"\"Validates that the histogram bars use the specified color.\"\"\"\n df = pd.DataFrame({\"E\": np.random.normal(0, 1, 100)})\n ax = f_825(\n df, \"E\", color=\"blue\", alpha=0.6\n ) # Match alpha value with the function's default or specified value\n for patch in ax.patches:\n self.assertEqual(\n patch.get_facecolor(),\n colors.to_rgba(\"blue\", alpha=0.6),\n \"The bars should match the specified color.\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["scipy.stats.norm.fit", "numpy.random", "matplotlib.pyplot.xlim", "scipy.stats.norm.pdf", "numpy.linspace", "matplotlib.pyplot.subplots", "numpy.random.seed"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Plots a histogram for a specified column of a pandas DataFrame and overlays", "it with a fitted normal distribution 
curve."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame.", "column (str): The column name for which the histogram is plotted.", "bins (int, optional): Number of bins for the histogram. Defaults to 30.", "density (bool, optional): If True, the histogram is normalized to form a", "probability density. Defaults to True.", "alpha (float, optional): Transparency level for the histogram bars.", "Defaults to 0.6.", "color (str, optional): Color of the histogram bars. Defaults to 'g'.", "seed (int, optional): Seed for the random number generator.", "Defaults to None (not set)."], "returns": ["matplotlib.axes._axes.Axes: The matplotlib Axes object with the plot."], "reqs": ["numpy", "matplotlib", "scipy"], "raises": [], "example": [">>> np.random.seed(0)", ">>> df = pd.DataFrame({'A': np.random.normal(0, 1, 1000)})", ">>> ax = f_825(df, 'A')", ">>> ax.get_title()", "\"Normal Fit for 'A'\""]}} {"task_id": "f_592", "prompt": "import pandas as pd\nimport numpy as np\nfrom scipy import stats\n\ndef f_592(df, column, alpha):\n \"\"\"\n Test the normality of a particular numeric column from a DataFrame with Shapiro-Wilk test, \n including an artificial step to explicitly use np.\n\n Parameters:\n - df (pd.DataFrame): The input DataFrame.\n - column (str): The column name.\n - alpha (float): The significance level.\n\n Returns:\n - bool: True if the column passes the normality test, False otherwise.\n\n Example:\n >>> df = pd.DataFrame({'Value': np.random.normal(0, 1, 1000)})\n >>> print(f_592(df, 'Value', 0.05))\n True\n \"\"\"", "canonical_solution": " # Artificial step to use np.mean for demonstration\n mean_value = np.mean(df[column])\n\n # Adjusting DataFrame for demonstration, this step is artificial\n df[column] = df[column] - mean_value\n\n if column not in df.columns:\n raise ValueError('Column does not exist in DataFrame')\n\n _, p = stats.shapiro(df[column])\n return p > alpha", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def 
test_case_1(self):\n df = pd.DataFrame({'Value': np.random.normal(0, 1, 1000)})\n self.assertTrue(f_592(df, 'Value', 0.05))\n def test_case_2(self):\n df = pd.DataFrame({'Value': np.random.uniform(0, 1, 1000)})\n self.assertFalse(f_592(df, 'Value', 0.05))\n def test_case_3(self):\n df = pd.DataFrame({'Value': np.random.exponential(1, 1000)})\n self.assertFalse(f_592(df, 'Value', 0.05))\n def test_case_4(self):\n df = pd.DataFrame({'Value': np.random.lognormal(0, 1, 1000)})\n self.assertFalse(f_592(df, 'Value', 0.05))\n def test_case_5(self):\n df = pd.DataFrame({'Value': np.random.chisquare(1, 1000)})\n self.assertFalse(f_592(df, 'Value', 0.05))", "apis": ["scipy.stats.shapiro", "numpy.mean"], "libs": ["numpy", "scipy"], "doc": {"description": ["Test the normality of a particular numeric column from a DataFrame with Shapiro-Wilk test,", "including an artificial step to explicitly use np."], "note": [], "params": ["df (pd.DataFrame): The input DataFrame.", "column (str): The column name.", "alpha (float): The significance level."], "returns": ["bool: True if the column passes the normality test, False otherwise."], "reqs": [], "raises": [], "example": [">>> df = pd.DataFrame({'Value': np.random.normal(0, 1, 1000)})", ">>> print(f_592(df, 'Value', 0.05))", "True"]}} -{"task_id": "f_373", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\n\ndef f_373(n_samples=1000, mu=0, sigma=1, random_seed=0):\n \"\"\"\n Generates a histogram and a probability density function (PDF) plot for a specified normal distribution.\n\n This function draws n_samples from a normal distribution defined by mean (mu) and standard deviation (sigma),\n plots a histogram of the samples, and overlays the PDF of the normal distribution. The histogram's density\n is normalized, and the PDF is plotted with a red line with linewidth=2.\n\n Parameters:\n - n_samples (int): Number of samples for the histogram. Must be greater than 0. 
Default is 1000.\n - mu (float): Mean for the normal distribution. Default is 0.\n - sigma (float): Standard deviation for the normal distribution. Must be greater than 0. Default is 1.\n - random_seed (int): Random seed for reproducibility. Defaults to 0.\n\n Returns:\n - ax (matplotlib.axes._axes.Axes): Axes object with the histogram and PDF plotted.\n - samples (numpy.ndarray): Generated sample data.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - scipy.stats.norm\n\n Example:\n >>> ax, samples = f_373()\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-5.0, 0, '\u22125'), Text(-4.0, 0, '\u22124'), Text(-3.0, 0, '\u22123'), Text(-2.0, 0, '\u22122'), Text(-1.0, 0, '\u22121'), Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5')]\n \"\"\"", "canonical_solution": " if n_samples <= 0 or sigma <= 0:\n raise ValueError(\"Invalid n_samples or sigma\")\n np.random.seed(random_seed)\n plt.figure()\n samples = np.random.normal(mu, sigma, n_samples)\n _, _, _ = plt.hist(samples, 30, density=True)\n ax = plt.gca()\n ax.plot(\n np.linspace(mu - 4 * sigma, mu + 4 * sigma, 1000),\n norm.pdf(np.linspace(mu - 4 * sigma, mu + 4 * sigma, 1000), mu, sigma),\n linewidth=2,\n color=\"r\",\n )\n return ax, samples", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport numpy as np\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.default_seed = 42\n cls.large_n_samples = 100000\n cls.small_n_samples = 100\n cls.zero_n_samples = 0\n cls.negative_n_samples = -100\n cls.default_mu = 0\n cls.default_sigma = 1\n cls.large_sigma = 5\n cls.small_sigma = 0.2\n cls.zero_sigma = 0\n cls.negative_sigma = -1\n cls.custom_mu = 5\n cls.custom_sigma = 2\n def test_case_1(self):\n # Test data generation correctness\n mu_test = 3\n sigma_test = 2\n n_samples_test = 10000\n random_seed_test = 42\n _, samples = f_373(\n n_samples=n_samples_test,\n mu=mu_test,\n sigma=sigma_test,\n 
random_seed=random_seed_test,\n )\n # Calculate sample mean and standard deviation\n sample_mean = np.mean(samples)\n sample_std = np.std(samples)\n # Verify sample mean and standard deviation are close to mu and sigma within a tolerance\n self.assertAlmostEqual(\n sample_mean,\n mu_test,\n places=1,\n msg=\"Sample mean does not match expected mean.\",\n )\n self.assertAlmostEqual(\n sample_std,\n sigma_test,\n places=1,\n msg=\"Sample standard deviation does not match expected sigma.\",\n )\n def test_case_2(self):\n # Default parameters\n ax, _ = f_373(random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_3(self):\n # Custom parameters: small number of samples, custom mean and standard deviation\n ax, _ = f_373(\n n_samples=self.small_n_samples,\n mu=self.custom_mu,\n sigma=self.custom_sigma,\n random_seed=self.default_seed,\n )\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_4(self):\n # Large number of samples\n ax, _ = f_373(n_samples=self.large_n_samples, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(len(ax.patches) >= 30)\n def test_case_5(self):\n # Small number of samples\n ax, _ = f_373(n_samples=self.small_n_samples, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(len(ax.patches) <= 30)\n def test_case_6(self):\n # Large standard deviation\n ax, _ = f_373(sigma=self.large_sigma, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_7(self):\n # Small standard deviation\n ax, _ = f_373(sigma=self.small_sigma, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_8(self):\n # Invalid negative standard deviation\n with self.assertRaises(ValueError):\n f_373(sigma=self.negative_sigma)\n def test_case_9(self):\n # Invalid zero 
standard deviation\n with self.assertRaises(Exception):\n f_373(sigma=self.zero_sigma)\n def test_case_10(self):\n # Invalid zero samples\n with self.assertRaises(Exception):\n f_373(n_samples=self.zero_n_samples)\n def test_case_11(self):\n # Invalid negative samples\n with self.assertRaises(ValueError):\n f_373(n_samples=self.negative_n_samples)\n def test_case_12(self):\n # Reproducibility with same seed\n ax1, sample1 = f_373(random_seed=self.default_seed)\n ax2, sample2 = f_373(random_seed=self.default_seed)\n self.assertEqual(ax1.patches[0].get_height(), ax2.patches[0].get_height())\n self.assertTrue((sample1 == sample2).all())\n def tearDown(self):\n plt.close(\"all\")", "apis": ["scipy.stats.norm.pdf", "numpy.random", "matplotlib.pyplot.hist", "numpy.random.seed", "matplotlib.pyplot.figure", "numpy.random.normal", "matplotlib.pyplot.gca", "numpy.linspace"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Generates a histogram and a probability density function (PDF) plot for a specified normal distribution.", "This function draws n_samples from a normal distribution defined by mean (mu) and standard deviation (sigma),", "plots a histogram of the samples, and overlays the PDF of the normal distribution. The histogram's density", "is normalized, and the PDF is plotted with a red line with linewidth=2."], "note": [], "params": ["n_samples (int): Number of samples for the histogram. Must be greater than 0. Default is 1000.", "mu (float): Mean for the normal distribution. Default is 0.", "sigma (float): Standard deviation for the normal distribution. Must be greater than 0. Default is 1.", "random_seed (int): Random seed for reproducibility. 
Defaults to 0."], "returns": ["ax (matplotlib.axes._axes.Axes): Axes object with the histogram and PDF plotted.", "samples (numpy.ndarray): Generated sample data."], "reqs": ["numpy", "matplotlib.pyplot", "scipy.stats.norm"], "raises": [], "example": [">>> ax, samples = f_373()", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-5.0, 0, '\u22125'), Text(-4.0, 0, '\u22124'), Text(-3.0, 0, '\u22123'), Text(-2.0, 0, '\u22122'), Text(-1.0, 0, '\u22121'), Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5')]"]}} -{"task_id": "f_391", "prompt": "from datetime import datetime\nimport pytz\nimport re\nfrom faker import Faker\n\n\ndef f_391(epoch_milliseconds, seed=0, timezones=[\"UTC\"]):\n \"\"\"Create a dictionary with a fake event schedule given an event time.\n\n The function converts a given epoch in milliseconds into a datetime object in\n the current system time's timezone. It generates a fake event name using Faker. \n Then, it uses pytz and regex to check if specified timezones are valid (i.e. \n in pytz.all_timezones or can be parsed using regex from UTC\u00b1HH:MM format), ignoring \n invalid ones. If none is valid or if timezones were not specified, it selects UTC; \n otherwise, it randomly selects a valid one using Faker. Finally, the function returns a \n dictionary with the fake event name as key and a list as value, where the list itself \n contains a schedule, i.e. a dictionary with keys 'date', 'time', 'timezone'.\n\n Parameters:\n - epoch_milliseconds (int): Epoch time in milliseconds. If negative, defaults to 0.\n - seed (int, optional): Random seed for Faker's RNG. 
Defaults to None.\n - timezones (list, optional): A list of timezones to select from.\n If none is valid or if not specified, defaults to ['UTC'].\n\n Returns:\n - A dictionary containing event names as keys and a list of event details as values.\n Event details include the date, time, and timezone of the event.\n\n Requirements:\n - datetime.datetime\n - faker\n - pytz\n - re\n\n Example:\n >>> f_391(1236472051807, seed=42)\n {'Danielle': [{'date': datetime.date(2009, 3, 8), 'time': datetime.time(11, 27, 31, 807000), 'timezone': 'UTC'}]}\n >>> f_391(1609459200000, seed=24, timezones=['UTC', 'UTC+01:00'])\n {'Jennifer': [{'date': datetime.date(2021, 1, 1), 'time': datetime.time(11, 0), 'timezone': 'UTC'}]}\n \"\"\"", "canonical_solution": " Faker.seed(seed)\n\n faker_instance = Faker()\n\n event_datetime = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n\n event_name = faker_instance.unique.first_name()\n\n validated_timezones = []\n utc_offset_regex = r\"^UTC([+-])(0[0-9]|1[0-4]):([0-5][0-9])$\"\n for tz in timezones:\n if (\n (tz == \"UTC\")\n or (re.match(utc_offset_regex, tz))\n or (tz in pytz.all_timezones)\n ):\n validated_timezones.append(tz)\n if not validated_timezones:\n validated_timezones = [\"UTC\"]\n\n timezone = faker_instance.random_element(elements=(validated_timezones))\n\n event_schedule = {\n event_name: [\n {\n \"date\": event_datetime.date(),\n \"time\": event_datetime.time(),\n \"timezone\": timezone,\n }\n ]\n }\n\n return event_schedule", "test": "import unittest\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n TIMEZONES = [\"UTC\", \"UTC+01:00\", \"UTC+02:00\", \"UTC+03:00\", \"UTC+04:00\", \"UTC+05:00\"]\n default_time = 1236472051807\n def check_structure_and_content(self, schedule, epoch_milliseconds):\n event_name = list(schedule.keys())[0]\n event_details = schedule[event_name]\n event_datetime = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n self.assertIsInstance(schedule, dict)\n 
self.assertEqual(len(schedule), 1)\n self.assertEqual(len(event_details), 1)\n self.assertEqual(event_details[0][\"date\"], event_datetime.date())\n self.assertEqual(event_details[0][\"time\"], event_datetime.time())\n self.assertIn(\n event_details[0][\"timezone\"], self.TIMEZONES\n ) # expected in these tests\n def test_case_1(self):\n # Test defaults\n epoch_milliseconds = self.default_time\n schedule = f_391(epoch_milliseconds)\n self.check_structure_and_content(schedule, epoch_milliseconds)\n self.assertTrue(schedule[list(schedule.keys())[0]][0][\"timezone\"] == \"UTC\")\n def test_case_2(self):\n # Test with a specific known epoch\n epoch_milliseconds = self.default_time\n schedule = f_391(epoch_milliseconds, seed=2, timezones=self.TIMEZONES)\n self.check_structure_and_content(schedule, epoch_milliseconds)\n def test_case_3(self):\n # Test with an invalid timezone list - should default to UTC\n schedule = f_391(self.default_time, seed=3, timezones=[\"INVALID\"])\n self.assertTrue(schedule[list(schedule.keys())[0]][0][\"timezone\"] == \"UTC\")\n schedule = f_391(self.default_time, seed=3, timezones=[\"FOO\", \"BAR\"])\n self.assertTrue(schedule[list(schedule.keys())[0]][0][\"timezone\"] == \"UTC\")\n for valid_tz in self.TIMEZONES:\n schedule = f_391(self.default_time, seed=3, timezones=[\"INVALID\", valid_tz])\n self.assertTrue(\n schedule[list(schedule.keys())[0]][0][\"timezone\"] == valid_tz,\n f'Expected {valid_tz}, got {schedule[list(schedule.keys())[0]][0][\"timezone\"]}',\n )\n def test_case_4(self):\n # Test random seed reproducibility\n schedule1 = f_391(self.default_time, seed=42, timezones=self.TIMEZONES)\n schedule2 = f_391(self.default_time, seed=42, timezones=self.TIMEZONES)\n self.assertEqual(schedule1, schedule2)\n def test_case_6(self):\n # Test handling invalid dates - invalid types\n for invalid in [\"1\", [], None]:\n with self.assertRaises(TypeError):\n f_391(invalid)\n def test_case_7(self):\n # Test handling extremely future dates\n 
epoch_milliseconds = (\n 4133980800000 # This is a date far in the future (2100-12-31)\n )\n schedule = f_391(epoch_milliseconds, seed=5, timezones=[\"UTC\", \"UTC+05:00\"])\n self.check_structure_and_content(schedule, epoch_milliseconds)\n # No additional asserts required, check_structure_and_content will validate\n def test_case_8(self):\n # Test handling leap year date\n epoch_milliseconds = 1582934400000 # This corresponds to 2020-02-29\n schedule = f_391(\n epoch_milliseconds, seed=6, timezones=[\"UTC\", \"UTC+01:00\", \"UTC+02:00\"]\n )\n self.check_structure_and_content(schedule, epoch_milliseconds)\n # Validate it handles the leap day correctly\n event_date = schedule[list(schedule.keys())[0]][0][\"date\"]\n self.assertTrue(event_date.year == 2020)\n self.assertTrue(event_date.month == 2)\n self.assertTrue(event_date.day == 29)", "apis": ["faker.Faker", "faker.Faker.seed", "pytz.all_timezones", "datetime.datetime.fromtimestamp", "re.match"], "libs": ["re", "datetime", "faker", "pytz"], "doc": {"description": ["Create a dictionary with a fake event schedule given an event time.", "The function converts a given epoch in milliseconds into a datetime object in", "the current system time's timezone. It generates a fake event name using Faker.", "Then, it uses pytz and regex to check if specified timezones are valid (i.e.", "in pytz.all_timezones or can be parsed using regex from UTC\u00b1HH:MM format), ignoring", "invalid ones. If none is valid or if timezones were not specified, it selects UTC;", "otherwise, it randomly selects a valid one using Faker. Finally, the function returns a", "dictionary with the fake event name as key and a list as value, where the list itself", "contains a schedule, i.e. a dictionary with keys 'date', 'time', 'timezone'."], "note": [], "params": ["epoch_milliseconds (int): Epoch time in milliseconds. If negative, defaults to 0.", "seed (int, optional): Random seed for Faker's RNG. 
Defaults to None.", "timezones (list, optional): A list of timezones to select from.", "If none is valid or if not specified, defaults to ['UTC']."], "returns": ["A dictionary containing event names as keys and a list of event details as values.", "Event details include the date, time, and timezone of the event."], "reqs": ["datetime.datetime", "faker", "pytz", "re"], "raises": [], "example": [">>> f_391(1236472051807, seed=42)", "{'Danielle': [{'date': datetime.date(2009, 3, 8), 'time': datetime.time(11, 27, 31, 807000), 'timezone': 'UTC'}]}", ">>> f_391(1609459200000, seed=24, timezones=['UTC', 'UTC+01:00'])", "{'Jennifer': [{'date': datetime.date(2021, 1, 1), 'time': datetime.time(11, 0), 'timezone': 'UTC'}]}"]}} +{"task_id": "f_373", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\n\ndef f_373(n_samples=1000, mu=0, sigma=1, random_seed=0):\n \"\"\"\n Generates a histogram and a probability density function (PDF) plot for a specified normal distribution.\n\n This function draws n_samples from a normal distribution defined by mean (mu) and standard deviation (sigma),\n plots a histogram of the samples, and overlays the PDF of the normal distribution. The histogram's density\n is normalized, and the PDF is plotted with a red line with linewidth=2.\n\n Parameters:\n - n_samples (int): Number of samples for the histogram. Must be greater than 0. Default is 1000.\n - mu (float): Mean for the normal distribution. Default is 0.\n - sigma (float): Standard deviation for the normal distribution. Must be greater than 0. Default is 1.\n - random_seed (int): Random seed for reproducibility. 
Defaults to 0.\n\n Returns:\n - ax (matplotlib.axes._axes.Axes): Axes object with the histogram and PDF plotted.\n - samples (numpy.ndarray): Generated sample data.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - scipy.stats.norm\n\n Example:\n >>> ax, samples = f_373()\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-5.0, 0, '\u22125'), Text(-4.0, 0, '\u22124'), Text(-3.0, 0, '\u22123'), Text(-2.0, 0, '\u22122'), Text(-1.0, 0, '\u22121'), Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5')]\n \"\"\"", "canonical_solution": " if n_samples <= 0 or sigma <= 0:\n raise ValueError(\"Invalid n_samples or sigma\")\n np.random.seed(random_seed)\n plt.figure()\n samples = np.random.normal(mu, sigma, n_samples)\n _, _, _ = plt.hist(samples, 30, density=True)\n ax = plt.gca()\n ax.plot(\n np.linspace(mu - 4 * sigma, mu + 4 * sigma, 1000),\n norm.pdf(np.linspace(mu - 4 * sigma, mu + 4 * sigma, 1000), mu, sigma),\n linewidth=2,\n color=\"r\",\n )\n return ax, samples", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport numpy as np\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.default_seed = 42\n cls.large_n_samples = 100000\n cls.small_n_samples = 100\n cls.zero_n_samples = 0\n cls.negative_n_samples = -100\n cls.default_mu = 0\n cls.default_sigma = 1\n cls.large_sigma = 5\n cls.small_sigma = 0.2\n cls.zero_sigma = 0\n cls.negative_sigma = -1\n cls.custom_mu = 5\n cls.custom_sigma = 2\n def test_case_1(self):\n # Test data generation correctness\n mu_test = 3\n sigma_test = 2\n n_samples_test = 10000\n random_seed_test = 42\n _, samples = f_373(\n n_samples=n_samples_test,\n mu=mu_test,\n sigma=sigma_test,\n random_seed=random_seed_test,\n )\n # Calculate sample mean and standard deviation\n sample_mean = np.mean(samples)\n sample_std = np.std(samples)\n # Verify sample mean and standard deviation are close to mu and sigma within a tolerance\n 
self.assertAlmostEqual(\n sample_mean,\n mu_test,\n places=1,\n msg=\"Sample mean does not match expected mean.\",\n )\n self.assertAlmostEqual(\n sample_std,\n sigma_test,\n places=1,\n msg=\"Sample standard deviation does not match expected sigma.\",\n )\n def test_case_2(self):\n # Default parameters\n ax, _ = f_373(random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_3(self):\n # Custom parameters: small number of samples, custom mean and standard deviation\n ax, _ = f_373(\n n_samples=self.small_n_samples,\n mu=self.custom_mu,\n sigma=self.custom_sigma,\n random_seed=self.default_seed,\n )\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_4(self):\n # Large number of samples\n ax, _ = f_373(n_samples=self.large_n_samples, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(len(ax.patches) >= 30)\n def test_case_5(self):\n # Small number of samples\n ax, _ = f_373(n_samples=self.small_n_samples, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(len(ax.patches) <= 30)\n def test_case_6(self):\n # Large standard deviation\n ax, _ = f_373(sigma=self.large_sigma, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_7(self):\n # Small standard deviation\n ax, _ = f_373(sigma=self.small_sigma, random_seed=self.default_seed)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 30)\n def test_case_8(self):\n # Invalid negative standard deviation\n with self.assertRaises(ValueError):\n f_373(sigma=self.negative_sigma)\n def test_case_9(self):\n # Invalid zero standard deviation\n with self.assertRaises(Exception):\n f_373(sigma=self.zero_sigma)\n def test_case_10(self):\n # Invalid zero samples\n with self.assertRaises(Exception):\n f_373(n_samples=self.zero_n_samples)\n def 
test_case_11(self):\n # Invalid negative samples\n with self.assertRaises(ValueError):\n f_373(n_samples=self.negative_n_samples)\n def test_case_12(self):\n # Reproducibility with same seed\n ax1, sample1 = f_373(random_seed=self.default_seed)\n ax2, sample2 = f_373(random_seed=self.default_seed)\n self.assertEqual(ax1.patches[0].get_height(), ax2.patches[0].get_height())\n self.assertTrue((sample1 == sample2).all())\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.figure", "numpy.random", "matplotlib.pyplot.hist", "scipy.stats.norm.pdf", "numpy.linspace", "numpy.random.normal", "numpy.random.seed", "matplotlib.pyplot.gca"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Generates a histogram and a probability density function (PDF) plot for a specified normal distribution.", "This function draws n_samples from a normal distribution defined by mean (mu) and standard deviation (sigma),", "plots a histogram of the samples, and overlays the PDF of the normal distribution. The histogram's density", "is normalized, and the PDF is plotted with a red line with linewidth=2."], "note": [], "params": ["n_samples (int): Number of samples for the histogram. Must be greater than 0. Default is 1000.", "mu (float): Mean for the normal distribution. Default is 0.", "sigma (float): Standard deviation for the normal distribution. Must be greater than 0. Default is 1.", "random_seed (int): Random seed for reproducibility. 
Defaults to 0."], "returns": ["ax (matplotlib.axes._axes.Axes): Axes object with the histogram and PDF plotted.", "samples (numpy.ndarray): Generated sample data."], "reqs": ["numpy", "matplotlib.pyplot", "scipy.stats.norm"], "raises": [], "example": [">>> ax, samples = f_373()", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-5.0, 0, '\u22125'), Text(-4.0, 0, '\u22124'), Text(-3.0, 0, '\u22123'), Text(-2.0, 0, '\u22122'), Text(-1.0, 0, '\u22121'), Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5')]"]}} +{"task_id": "f_391", "prompt": "from datetime import datetime\nimport pytz\nimport re\nfrom faker import Faker\n\n\ndef f_391(epoch_milliseconds, seed=0, timezones=[\"UTC\"]):\n \"\"\"Create a dictionary with a fake event schedule given an event time.\n\n The function converts a given epoch in milliseconds into a datetime object in\n the current system time's timezone. It generates a fake event name using Faker. \n Then, it uses pytz and regex to check if specified timezones are valid (i.e. \n in pytz.all_timezones or can be parsed using regex from UTC\u00b1HH:MM format), ignoring \n invalid ones. If none is valid or if timezones were not specified, it selects UTC; \n otherwise, it randomly selects a valid one using Faker. Finally, the function returns a \n dictionary with the fake event name as key and a list as value, where the list itself \n contains a schedule, i.e. a dictionary with keys 'date', 'time', 'timezone'.\n\n Parameters:\n - epoch_milliseconds (int): Epoch time in milliseconds. If negative, defaults to 0.\n - seed (int, optional): Random seed for Faker's RNG. 
Defaults to None.\n - timezones (list, optional): A list of timezones to select from.\n If none is valid or if not specified, defaults to ['UTC'].\n\n Returns:\n - A dictionary containing event names as keys and a list of event details as values.\n Event details include the date, time, and timezone of the event.\n\n Requirements:\n - datetime.datetime\n - faker\n - pytz\n - re\n\n Example:\n >>> f_391(1236472051807, seed=42)\n {'Danielle': [{'date': datetime.date(2009, 3, 8), 'time': datetime.time(11, 27, 31, 807000), 'timezone': 'UTC'}]}\n >>> f_391(1609459200000, seed=24, timezones=['UTC', 'UTC+01:00'])\n {'Jennifer': [{'date': datetime.date(2021, 1, 1), 'time': datetime.time(11, 0), 'timezone': 'UTC'}]}\n \"\"\"", "canonical_solution": " Faker.seed(seed)\n\n faker_instance = Faker()\n\n event_datetime = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n\n event_name = faker_instance.unique.first_name()\n\n validated_timezones = []\n utc_offset_regex = r\"^UTC([+-])(0[0-9]|1[0-4]):([0-5][0-9])$\"\n for tz in timezones:\n if (\n (tz == \"UTC\")\n or (re.match(utc_offset_regex, tz))\n or (tz in pytz.all_timezones)\n ):\n validated_timezones.append(tz)\n if not validated_timezones:\n validated_timezones = [\"UTC\"]\n\n timezone = faker_instance.random_element(elements=(validated_timezones))\n\n event_schedule = {\n event_name: [\n {\n \"date\": event_datetime.date(),\n \"time\": event_datetime.time(),\n \"timezone\": timezone,\n }\n ]\n }\n\n return event_schedule", "test": "import unittest\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n TIMEZONES = [\"UTC\", \"UTC+01:00\", \"UTC+02:00\", \"UTC+03:00\", \"UTC+04:00\", \"UTC+05:00\"]\n default_time = 1236472051807\n def check_structure_and_content(self, schedule, epoch_milliseconds):\n event_name = list(schedule.keys())[0]\n event_details = schedule[event_name]\n event_datetime = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n self.assertIsInstance(schedule, dict)\n 
self.assertEqual(len(schedule), 1)\n self.assertEqual(len(event_details), 1)\n self.assertEqual(event_details[0][\"date\"], event_datetime.date())\n self.assertEqual(event_details[0][\"time\"], event_datetime.time())\n self.assertIn(\n event_details[0][\"timezone\"], self.TIMEZONES\n ) # expected in these tests\n def test_case_1(self):\n # Test defaults\n epoch_milliseconds = self.default_time\n schedule = f_391(epoch_milliseconds)\n self.check_structure_and_content(schedule, epoch_milliseconds)\n self.assertTrue(schedule[list(schedule.keys())[0]][0][\"timezone\"] == \"UTC\")\n def test_case_2(self):\n # Test with a specific known epoch\n epoch_milliseconds = self.default_time\n schedule = f_391(epoch_milliseconds, seed=2, timezones=self.TIMEZONES)\n self.check_structure_and_content(schedule, epoch_milliseconds)\n def test_case_3(self):\n # Test with an invalid timezone list - should default to UTC\n schedule = f_391(self.default_time, seed=3, timezones=[\"INVALID\"])\n self.assertTrue(schedule[list(schedule.keys())[0]][0][\"timezone\"] == \"UTC\")\n schedule = f_391(self.default_time, seed=3, timezones=[\"FOO\", \"BAR\"])\n self.assertTrue(schedule[list(schedule.keys())[0]][0][\"timezone\"] == \"UTC\")\n for valid_tz in self.TIMEZONES:\n schedule = f_391(self.default_time, seed=3, timezones=[\"INVALID\", valid_tz])\n self.assertTrue(\n schedule[list(schedule.keys())[0]][0][\"timezone\"] == valid_tz,\n f'Expected {valid_tz}, got {schedule[list(schedule.keys())[0]][0][\"timezone\"]}',\n )\n def test_case_4(self):\n # Test random seed reproducibility\n schedule1 = f_391(self.default_time, seed=42, timezones=self.TIMEZONES)\n schedule2 = f_391(self.default_time, seed=42, timezones=self.TIMEZONES)\n self.assertEqual(schedule1, schedule2)\n def test_case_6(self):\n # Test handling invalid dates - invalid types\n for invalid in [\"1\", [], None]:\n with self.assertRaises(TypeError):\n f_391(invalid)\n def test_case_7(self):\n # Test handling extremely future dates\n 
epoch_milliseconds = (\n 4133980800000 # This is a date far in the future (2100-12-31)\n )\n schedule = f_391(epoch_milliseconds, seed=5, timezones=[\"UTC\", \"UTC+05:00\"])\n self.check_structure_and_content(schedule, epoch_milliseconds)\n # No additional asserts required, check_structure_and_content will validate\n def test_case_8(self):\n # Test handling leap year date\n epoch_milliseconds = 1582934400000 # This corresponds to 2020-02-29\n schedule = f_391(\n epoch_milliseconds, seed=6, timezones=[\"UTC\", \"UTC+01:00\", \"UTC+02:00\"]\n )\n self.check_structure_and_content(schedule, epoch_milliseconds)\n # Validate it handles the leap day correctly\n event_date = schedule[list(schedule.keys())[0]][0][\"date\"]\n self.assertTrue(event_date.year == 2020)\n self.assertTrue(event_date.month == 2)\n self.assertTrue(event_date.day == 29)", "apis": ["re.match", "pytz.all_timezones", "faker.Faker.seed", "faker.Faker", "datetime.datetime.fromtimestamp"], "libs": ["re", "faker", "pytz", "datetime"], "doc": {"description": ["Create a dictionary with a fake event schedule given an event time.", "The function converts a given epoch in milliseconds into a datetime object in", "the current system time's timezone. It generates a fake event name using Faker.", "Then, it uses pytz and regex to check if specified timezones are valid (i.e.", "in pytz.all_timezones or can be parsed using regex from UTC\u00b1HH:MM format), ignoring", "invalid ones. If none is valid or if timezones were not specified, it selects UTC;", "otherwise, it randomly selects a valid one using Faker. Finally, the function returns a", "dictionary with the fake event name as key and a list as value, where the list itself", "contains a schedule, i.e. a dictionary with keys 'date', 'time', 'timezone'."], "note": [], "params": ["epoch_milliseconds (int): Epoch time in milliseconds. If negative, defaults to 0.", "seed (int, optional): Random seed for Faker's RNG. 
Defaults to None.", "timezones (list, optional): A list of timezones to select from.", "If none is valid or if not specified, defaults to ['UTC']."], "returns": ["A dictionary containing event names as keys and a list of event details as values.", "Event details include the date, time, and timezone of the event."], "reqs": ["datetime.datetime", "faker", "pytz", "re"], "raises": [], "example": [">>> f_391(1236472051807, seed=42)", "{'Danielle': [{'date': datetime.date(2009, 3, 8), 'time': datetime.time(11, 27, 31, 807000), 'timezone': 'UTC'}]}", ">>> f_391(1609459200000, seed=24, timezones=['UTC', 'UTC+01:00'])", "{'Jennifer': [{'date': datetime.date(2021, 1, 1), 'time': datetime.time(11, 0), 'timezone': 'UTC'}]}"]}} {"task_id": "f_529", "prompt": "import itertools\nimport math\n\ndef f_529(x):\n \"\"\"\n Find the sub-sequence of a dictionary, x, with the minimum total length, where the keys are letters and the values are their lengths.\n\n Parameters:\n - x (dict): The dictionary of letter lengths.\n\n Returns:\n - list: The subsequence with the minimum total length.\n\n Requirements:\n - itertools\n - math\n\n Example:\n >>> f_529({'a': 1, 'b': 2, 'c': 3})\n ['a']\n >>> f_529({'a': 1, 'b': -2, 'c': -5, 'd': 4})\n ['b', 'c']\n \"\"\"", "canonical_solution": " min_length = math.inf\n min_subseq = []\n\n for r in range(1, len(x) + 1):\n for subseq in itertools.combinations(x.items(), r):\n length = sum(length for letter, length in subseq)\n if length < min_length:\n min_length = length\n min_subseq = [letter for letter, length in subseq]\n\n return min_subseq", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_529({'a': 1, 'b': 2, 'c': 3}), ['a'])\n def test_case_2(self):\n self.assertEqual(sorted(f_529({'a': 1, 'b': -2, 'c': -5, 'd': 4})), sorted(['b', 'c']))\n def test_case_3(self):\n self.assertEqual(f_529({'a': 1, 'b': 2, 'c': 3, 'd': 4}), ['a'])\n def test_case_4(self):\n 
self.assertEqual(sorted(f_529({'a': -1, 'b': 2, 'c': 3, 'd': 4, 'e': -5})), sorted(['a', 'e']))\n def test_case_5(self):\n self.assertEqual(sorted(f_529({'a': -1, 'b': -2, 'c': -3, 'd': 4, 'e': 5})), sorted(['a', 'b', 'c']))", "apis": ["itertools.combinations", "math.inf"], "libs": ["itertools", "math"], "doc": {"description": ["Find the sub-sequence of a dictionary, x, with the minimum total length, where the keys are letters and the values are their lengths."], "note": [], "params": ["x (dict): The dictionary of letter lengths."], "returns": ["list: The subsequence with the minimum total length."], "reqs": ["itertools", "math"], "raises": [], "example": [">>> f_529({'a': 1, 'b': 2, 'c': 3})", "['a']", ">>> f_529({'a': 1, 'b': -2, 'c': -5, 'd': 4})", "['b', 'c']"]}} -{"task_id": "f_340", "prompt": "import pandas as pd\nfrom random import randint\n\n\ndef f_340(name: str, age: int, code: str, salary: float, bio: str) -> pd.DataFrame:\n \"\"\"\n Generate a Pandas DataFrame of employees with their details based on the input provided.\n\n Parameters:\n - name (str): Name of the employee. This is case-sensitive. 
Must be one of the predefined\n names: 'John', 'Alice', 'Bob', 'Charlie', 'David', otherwise the function raises\n ValueError.\n - age (int): Age of the employee.\n - code (str): Code of the employee.\n - salary (float): Salary of the employee.\n - bio (str): Biography of the employee.\n\n Returns:\n data_df (pd.DataFrame): dataframe with columns: 'Name', 'Age', 'Code', 'Salary', 'Bio', 'Job Title'.\n The 'Job Title' is randomly assigned from the predefined job titles:\n 'Engineer', 'Manager', 'Analyst', 'Developer', 'Tester'.\n\n Requirements:\n - pandas\n - random.randint\n\n Example:\n >>> random.seed(0)\n >>> df = f_340(\"John\", 30, \"A10B\", 5000.0, \"This is a bio with spaces\")\n >>> print(df)\n Name Age Code Salary Bio Job Title\n 0 John 30 A10B 5000.0 This is a bio with spaces Developer\n \"\"\"", "canonical_solution": " EMPLOYEES = [\"John\", \"Alice\", \"Bob\", \"Charlie\", \"David\"]\n JOBS = [\"Engineer\", \"Manager\", \"Analyst\", \"Developer\", \"Tester\"]\n\n if name not in EMPLOYEES:\n raise ValueError(f\"Invalid employee name. 
Must be one of {EMPLOYEES}\")\n\n job = JOBS[randint(0, len(JOBS) - 1)]\n data_df = pd.DataFrame(\n [[name, age, code, salary, bio, job]],\n columns=[\"Name\", \"Age\", \"Code\", \"Salary\", \"Bio\", \"Job Title\"],\n )\n return data_df", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test the DataFrame structure for a known input\n df = f_340(\"John\", 30, \"A10B\", 5000.0, \"Sample bio\")\n expected_columns = [\"Name\", \"Age\", \"Code\", \"Salary\", \"Bio\", \"Job Title\"]\n self.assertListEqual(\n list(df.columns), expected_columns, \"DataFrame columns mismatch\"\n )\n for col, dtype in zip(\n df.columns, [\"object\", \"int64\", \"object\", \"float64\", \"object\", \"object\"]\n ):\n self.assertTrue(\n df[col].dtype == dtype,\n f\"Column {col} has incorrect type {df[col].dtype}\",\n )\n def test_case_2(self):\n # Test minimum and maximum valid ages and salary, including edge cases\n df_min_age = f_340(\"Alice\", 18, \"X10Y\", 0.0, \"Minimum age and salary\")\n self.assertEqual(df_min_age[\"Age\"][0], 18)\n self.assertEqual(df_min_age[\"Salary\"][0], 0.0)\n df_max_age = f_340(\"Bob\", 65, \"Z99W\", 1000000.0, \"Maximum age and high salary\")\n self.assertEqual(df_max_age[\"Age\"][0], 65)\n self.assertEqual(df_max_age[\"Salary\"][0], 1000000.0)\n def test_case_3(self):\n # Test bio with special characters, very long string, and empty string\n df_special_bio = f_340(\"Charlie\", 30, \"C30D\", 5300.0, \"!@#$%^&*()_+|\")\n self.assertEqual(df_special_bio[\"Bio\"][0], \"!@#$%^&*()_+|\")\n df_long_bio = f_340(\"David\", 30, \"D40E\", 5400.5, \"a\" * 1000)\n self.assertEqual(len(df_long_bio[\"Bio\"][0]), 1000)\n df_empty_bio = f_340(\"John\", 30, \"E50F\", 5500.0, \"\")\n self.assertEqual(df_empty_bio[\"Bio\"][0], \"\")\n def test_case_4(self):\n # Test code with different formats\n df_code_special_chars = f_340(\n \"Alice\", 25, \"!@#$\", 5500.5, \"Bio with special char code\"\n )\n 
self.assertEqual(df_code_special_chars[\"Code\"][0], \"!@#$\")\n def test_case_5(self):\n # Test for case sensitivity\n with self.assertRaises(ValueError):\n f_340(\"john\", 30, \"J01K\", 5000.0, \"Case sensitive name test\")\n def test_case_6(self):\n # Test each predefined name\n for name in [\"John\", \"Alice\", \"Bob\", \"Charlie\", \"David\"]:\n df = f_340(name, 30, \"A10B\", 5000.0, f\"{name}'s bio\")\n self.assertEqual(\n df[\"Name\"][0], name, f\"Valid name {name} failed to create a DataFrame\"\n )\n def test_case_7(self):\n # Test randomness in job assignment\n job_titles_first_run = []\n job_titles_second_run = []\n job_titles_third_run = []\n n_iter = 15\n name, age, code, salary, bio = (\n \"Bob\",\n 30,\n \"B20C\",\n 5000.0,\n \"Testing randomness in job titles\",\n )\n random.seed(42) # Set the seed for the first run\n for _ in range(n_iter):\n df = f_340(name, age, code, salary, bio)\n job_titles_first_run.append(df[\"Job Title\"][0])\n random.seed(42) # Reset the seed to ensure reproducibility for the second run\n for _ in range(n_iter):\n df = f_340(name, age, code, salary, bio)\n job_titles_second_run.append(df[\"Job Title\"][0])\n random.seed(0) # Repeat for third run with different seed\n for _ in range(n_iter):\n df = f_340(name, age, code, salary, bio)\n job_titles_third_run.append(df[\"Job Title\"][0])\n self.assertEqual(job_titles_first_run, job_titles_second_run)\n self.assertNotEqual(job_titles_first_run, job_titles_third_run)\n def test_case_8(self):\n # Test invalid name\n with self.assertRaises(ValueError):\n f_340(\"InvalidName\", 28, \"C30D\", 5300.0, \"Bio of InvalidName\")", "apis": ["random.randint", "pandas.DataFrame"], "libs": ["pandas", "random"], "doc": {"description": ["Generate a Pandas DataFrame of employees with their details based on the input provided."], "note": [], "params": ["name (str): Name of the employee. This is case-sensitive. 
Must be one of the predefined", "names: 'John', 'Alice', 'Bob', 'Charlie', 'David', otherwise the function raises", "ValueError.", "age (int): Age of the employee.", "code (str): Code of the employee.", "salary (float): Salary of the employee.", "bio (str): Biography of the employee."], "returns": ["data_df (pd.DataFrame): dataframe with columns: 'Name', 'Age', 'Code', 'Salary', 'Bio', 'Job Title'.", "The 'Job Title' is randomly assigned from the predefined job titles:", "'Engineer', 'Manager', 'Analyst', 'Developer', 'Tester'."], "reqs": ["pandas", "random.randint"], "raises": [], "example": [">>> random.seed(0)", ">>> df = f_340(\"John\", 30, \"A10B\", 5000.0, \"This is a bio with spaces\")", ">>> print(df)", "Name Age Code Salary Bio Job Title", "0 John 30 A10B 5000.0 This is a bio with spaces Developer"]}} -{"task_id": "f_360", "prompt": "import json\nimport re\nimport pandas as pd\n\n\ndef f_360(json_str):\n \"\"\"\n Load a JSON string into a dictionary, normalize the dictionary by doubling the numerical values,\n and then create a Pandas DataFrame from the dictionary.\n\n This function processes a JSON string by converting it into a dictionary, normalizes the data\n by doubling the numerical values, and then constructs a Pandas DataFrame from this dictionary.\n Note: the function is designed to handle simple flat dictionaries, with values that are either\n single numerical values, lists of numerical values, or strings that can be interpreted as\n numbers. It doubles the values of numerical data types within the dictionary, including those\n within lists and those in strings (which are extracted using regex), but the function does not\n process nested dictionaries. 
Finally, it returns the DataFrame with numerical values stored as\n floats and other types left as-is, or an empty DataFrame if the input JSON string is empty or\n does not contain any valid data structures for DataFrame conversion.\n\n Parameters:\n json_str (str): The JSON string.\n\n Returns:\n DataFrame: A pandas DataFrame created from the dictionary.\n\n Requirements:\n - pandas\n - json\n - re\n\n Example:\n >>> json_str = '{\"a\": [1, 2, 3], \"b\": 4.9, \"c\": \"5\"}'\n >>> df = f_360(json_str)\n >>> type(df)\n \n >>> print(df)\n a b c\n 0 2 9.8 10\n 1 4 9.8 10\n 2 6 9.8 10\n \"\"\"", "canonical_solution": " NUMBERS = re.compile(r\"^-?\\d+(?:\\.\\d+)?$\")\n\n my_dict = json.loads(json_str)\n\n if not my_dict:\n return pd.DataFrame()\n\n for key, value in my_dict.items():\n if isinstance(value, list):\n my_dict[key] = [v * 2 if isinstance(v, (int, float)) else v for v in value]\n elif isinstance(value, (int, float)):\n my_dict[key] = value * 2\n elif isinstance(value, str) and NUMBERS.match(value):\n try:\n my_dict[key] = int(value) * 2\n except ValueError:\n my_dict[key] = float(value) * 2\n\n if all(not isinstance(v, list) for v in my_dict.values()):\n df = pd.DataFrame([my_dict])\n else:\n df = pd.DataFrame(my_dict)\n\n for col in df.columns:\n converted_col = pd.to_numeric(df[col], errors=\"coerce\")\n if not converted_col.isnull().any():\n df[col] = converted_col\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n json_str = '{\"a\": [1, 2, 3], \"b\": 4.9, \"c\": \"5\"}'\n expected_output = pd.DataFrame(\n {\"a\": [2, 4, 6], \"b\": [9.8, 9.8, 9.8], \"c\": [10, 10, 10]}\n )\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_2(self):\n json_str = \"{}\"\n expected_output = pd.DataFrame()\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_3(self):\n json_str = '{\"a\": [1, \"apple\", 3], \"b\": 4.9, \"c\": \"5\", \"d\": 
\"banana\"}'\n expected_output = pd.DataFrame(\n {\n \"a\": [2, \"apple\", 6],\n \"b\": [9.8, 9.8, 9.8],\n \"c\": [10, 10, 10],\n \"d\": [\"banana\", \"banana\", \"banana\"],\n }\n )\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_4(self):\n json_str = '{\"a\": \"1\", \"b\": \"2.5\", \"c\": \"string\"}'\n expected_output = pd.DataFrame({\"a\": [2], \"b\": [5.0], \"c\": [\"string\"]})\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_5(self):\n json_str = '{\"a\": [1, 2, {\"b\": 3}], \"c\": 4.9}'\n expected_output = pd.DataFrame({\"a\": [2, 4, {\"b\": 3}], \"c\": [9.8, 9.8, 9.8]})\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)", "apis": ["pandas.to_numeric", "json.loads", "pandas.DataFrame", "re.compile"], "libs": ["pandas", "json", "re"], "doc": {"description": ["Load a JSON string into a dictionary, normalize the dictionary by doubling the numerical values,", "and then create a Pandas DataFrame from the dictionary.", "This function processes a JSON string by converting it into a dictionary, normalizes the data", "by doubling the numerical values, and then constructs a Pandas DataFrame from this dictionary."], "note": ["the function is designed to handle simple flat dictionaries, with values that are either", "single numerical values, lists of numerical values, or strings that can be interpreted as", "numbers. It doubles the values of numerical data types within the dictionary, including those", "within lists and those in strings (which are extracted using regex), but the function does not", "process nested dictionaries. 
Finally, it returns the DataFrame with numerical values stored as", "floats and other types left as-is, or an empty DataFrame if the input JSON string is empty or", "does not contain any valid data structures for DataFrame conversion."], "params": ["json_str (str): The JSON string."], "returns": ["DataFrame: A pandas DataFrame created from the dictionary."], "reqs": ["pandas", "json", "re"], "raises": [], "example": [">>> json_str = '{\"a\": [1, 2, 3], \"b\": 4.9, \"c\": \"5\"}'", ">>> df = f_360(json_str)", ">>> type(df)", "", ">>> print(df)", "a b c", "0 2 9.8 10", "1 4 9.8 10", "2 6 9.8 10"]}} -{"task_id": "f_411", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_411(data):\n \"\"\"\n Combine a list of dictionaries with the same keys into a single dictionary, turn it into a\n Pandas DataFrame and create a line plot of the data.\n\n Parameters:\n data (list): A list of dictionaries. The keys are labels and the values are data points.\n\n Returns:\n matplotlib.axes._subplots.AxesSubplot or None: Axes object of the plot showing 'Data over Time',\n with 'Time' on the x-axis and 'Data Points' on the y-axis.\n If data is empty, return None.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_411([{'A': 10, 'B': 15, 'C': 12},\\\n {'A': 12, 'B': 20, 'C': 14},\\\n {'A': 15, 'B': 18, 'C': 15},\\\n {'A': 11, 'B': 17, 'C': 13}])\n >>> type(ax)\n \n >>> ax.get_title()\n 'Data over Time'\n >>> len(ax.lines)\n 3\n \"\"\"", "canonical_solution": " if not data:\n return None\n df = pd.DataFrame(data)\n plt.figure()\n for label in df.columns:\n plt.plot(df[label], label=label)\n plt.xlabel(\"Time\")\n plt.ylabel(\"Data Points\")\n plt.title(\"Data over Time\")\n return plt.gca()", "test": "import unittest\nimport matplotlib\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.data1 = [\n {\"A\": 10, \"B\": 15, \"C\": 12},\n {\"A\": 
12, \"B\": 20, \"C\": 14},\n {\"A\": 15, \"B\": 18, \"C\": 15},\n {\"A\": 11, \"B\": 17, \"C\": 13},\n ]\n cls.data2 = [\n {\"X\": 5, \"Y\": 8},\n {\"X\": 6, \"Y\": 7},\n {\"X\": 7, \"Y\": 6},\n {\"X\": 8, \"Y\": 5},\n ]\n cls.data3 = [{\"P\": 3, \"Q\": 2, \"R\": 4, \"S\": 1}, {\"P\": 4, \"Q\": 3, \"R\": 2, \"S\": 3}]\n cls.data4 = [{\"W\": 7}, {\"W\": 8}, {\"W\": 9}, {\"W\": 6}]\n cls.data5 = [{\"M\": 1, \"N\": 3}, {\"M\": 3, \"N\": 1}]\n def test_case_1(self):\n # Test for correct Axes instance and labels for a typical data set\n ax = f_411(self.data1)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(ax.get_title(), \"Data over Time\")\n self.assertEqual(ax.get_xlabel(), \"Time\")\n self.assertEqual(ax.get_ylabel(), \"Data Points\")\n self.assertEqual(len(ax.lines), 3)\n def test_case_2(self):\n # Test for different keys across dictionaries in data list\n data = [{\"A\": 1, \"B\": 2}, {\"B\": 3, \"C\": 4}, {\"A\": 5, \"C\": 6}]\n ax = f_411(data)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertTrue(len(ax.lines) > 0)\n def test_case_3(self):\n # Test with empty data list\n self.assertIsNone(f_411([]))\n def test_case_4(self):\n # Test with data containing non-numeric values\n data = [{\"A\": \"text\", \"B\": \"more text\"}, {\"A\": 1, \"B\": 2}]\n with self.assertRaises(TypeError):\n f_411(data)\n def test_case_5(self):\n # Test with a single entry in the data list\n data = [{\"A\": 1, \"B\": 2}]\n ax = f_411(data)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines), 2)\n def test_case_6(self):\n # Test focusing on data processing correctness\n data = [\n {\"A\": 10, \"B\": 15, \"C\": 12},\n {\"A\": 12, \"B\": 20, \"C\": 14},\n {\"A\": 15, \"B\": 18, \"C\": 15},\n {\"A\": 11, \"B\": 17, \"C\": 13},\n ]\n ax = f_411(data)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n # Convert input data to DataFrame for easy comparison\n input_df = pd.DataFrame(data)\n # Iterate through each line in the plot 
and check against the input data\n for line in ax.lines:\n label = line.get_label()\n _, y_data = line.get_data()\n expected_y_data = input_df[label].values\n # Use numpy to compare the y_data from plot and expected data from input\n np.testing.assert_array_equal(\n y_data, expected_y_data, err_msg=f\"Data mismatch for label {label}\"\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.xlabel", "matplotlib.pyplot.plot", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.figure", "matplotlib.pyplot.gca", "pandas.DataFrame"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Combine a list of dictionaries with the same keys into a single dictionary, turn it into a", "Pandas DataFrame and create a line plot of the data."], "note": [], "params": ["data (list): A list of dictionaries. The keys are labels and the values are data points."], "returns": ["matplotlib.axes._subplots.AxesSubplot or None: Axes object of the plot showing 'Data over Time',", "with 'Time' on the x-axis and 'Data Points' on the y-axis.", "If data is empty, return None."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_411([{'A': 10, 'B': 15, 'C': 12},\\", "{'A': 12, 'B': 20, 'C': 14},\\", "{'A': 15, 'B': 18, 'C': 15},\\", "{'A': 11, 'B': 17, 'C': 13}])", ">>> type(ax)", "", ">>> ax.get_title()", "'Data over Time'", ">>> len(ax.lines)", "3"]}} -{"task_id": "f_379", "prompt": "import pandas as pd\nimport random\nimport re\n\n\ndef f_379(data_list, seed=42):\n \"\"\"\n Randomizes the order of comma-separated substrings within each string in a list,\n normalizing spaces to ensure a single space follows each comma using regex, then\n returns a DataFrame comparing original and randomized strings.\n\n Parameters:\n data_list (list of str): List of strings with substrings to be randomized.\n seed (int, optional): Seed for random number generator for reproducibility. 
Defaults to None.\n\n Returns:\n pandas.DataFrame: A DataFrame with columns 'Original String' and 'Randomized String'.\n\n Requirements:\n - pandas\n - random\n - re\n\n Example:\n >>> df = f_379(['lamp, bag, mirror', 'table, chair, bag'], seed=42)\n >>> df['Original String'][0]\n 'lamp, bag, mirror'\n >>> df['Randomized String'][0]\n 'mirror, lamp, bag'\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n randomized_strings = []\n for s in data_list:\n substrings = re.split(\"\\s*,\\s*\", s)\n random_positions = random.sample(range(len(substrings)), len(substrings))\n randomized_s = \", \".join([substrings[i] for i in random_positions])\n randomized_strings.append(randomized_s)\n\n df[\"Randomized String\"] = randomized_strings\n\n return df", "test": "import unittest\nimport pandas as pd\nimport re\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic functionality with a reproducible seed\n input_data = [\"a, b\", \"c, d, e\"]\n df = f_379(input_data, seed=42)\n self.assertEqual(len(df), 2)\n self.assertListEqual(df[\"Original String\"].tolist(), input_data)\n self.assertNotEqual(\n df[\"Original String\"].tolist(), df[\"Randomized String\"].tolist()\n )\n self.assertSetEqual(\n set(df[\"Original String\"].tolist()[0].split(\", \")),\n set(df[\"Randomized String\"].tolist()[0].split(\", \")),\n )\n def test_case_2(self):\n # Test function's behavior with an empty input list\n input_data = []\n df = f_379(input_data)\n self.assertEqual(len(df), 0)\n def test_case_3(self):\n # Test with single items (no commas) to verify output matches input exactly\n input_data = [\"a\", \"b\", \"c\"]\n df = f_379(input_data)\n self.assertListEqual(\n df[\"Original String\"].tolist(), df[\"Randomized String\"].tolist()\n )\n def test_case_4(self):\n # Test with strings containing only commas\n input_data = [\",,,\", \",,\"]\n expected_output = [\", , , \", \", , \"]\n df = 
f_379(input_data)\n self.assertTrue(\n all(df[\"Randomized String\"].apply(lambda x: x in expected_output))\n )\n def test_case_5(self):\n # Test strings with inconsistent use of spaces and delimiters\n input_data = [\"a,b, c\", \"d ,e, f\"] # Inputs with inconsistent spacing\n df = f_379(input_data, seed=24)\n for i in range(len(input_data)):\n original_substrings = set(re.split(\"\\s*,\\s*\", input_data[i]))\n randomized_substrings = set(df[\"Randomized String\"].iloc[i].split(\", \"))\n self.assertEqual(\n original_substrings,\n randomized_substrings,\n )\n def test_case_6(self):\n # Test with strings that include special characters\n input_data = [\"!@#, $%^\", \"&*(), )(_+\"]\n df = f_379(input_data, seed=99)\n self.assertEqual(len(df), 2)\n for orig, rand in zip(df[\"Original String\"], df[\"Randomized String\"]):\n self.assertSetEqual(set(orig.split(\", \")), set(rand.split(\", \")))\n def test_case_7(self):\n # Test random seed\n input_data = [\"lamp, bag, mirror\", \"table, chair, vase\"]\n df1 = f_379(input_data, seed=42)\n df2 = f_379(input_data, seed=42)\n self.assertListEqual(\n df1[\"Randomized String\"].tolist(), df2[\"Randomized String\"].tolist()\n )\n def test_case_8(self):\n # Test the handling of non-standard separators\n input_data = [\"a;b;c\", \"d:e:f\"]\n df = f_379(input_data)\n self.assertListEqual(\n df[\"Original String\"].tolist(), df[\"Randomized String\"].tolist()\n )\n def test_case_9(self):\n ## Test handling of strings with commas not followed by spaces\n input_data = [\"a,b,c\", \"d,e,f\"]\n df = f_379(input_data, seed=42)\n for idx in range(len(input_data)):\n original_substrings = set(re.split(\",\\s*\", input_data[idx].strip()))\n randomized_substrings = set(df[\"Randomized String\"].iloc[idx].split(\", \"))\n self.assertEqual(\n original_substrings,\n randomized_substrings,\n \"Substrings should be preserved and normalized after randomization.\",\n )\n def test_case_10(self):\n # Test handling of strings with leading or 
trailing spaces\n input_data = [\" a, b, c \", \" d, e, f \"]\n df = f_379(input_data, seed=42)\n for idx in range(len(input_data)):\n original_substrings = set(\n x.strip() for x in re.split(\",\\s*\", input_data[idx].strip())\n )\n randomized_substrings = set(\n x.strip() for x in df[\"Randomized String\"].iloc[idx].split(\", \")\n )\n self.assertEqual(\n original_substrings,\n randomized_substrings,\n \"Ensure substrings match after randomization, ignoring leading/trailing spaces.\",\n )\n def test_case_11(self):\n # Test handling of strings with multiple spaces after a comma\n input_data = [\"a, b, c\", \"d, e, f\"]\n df = f_379(input_data, seed=42)\n for rand_str in df[\"Randomized String\"].tolist():\n self.assertTrue(\n \", \" not in rand_str\n and \", \" not in rand_str\n and \", \" not in rand_str,\n \"Multiple spaces after commas should not appear in output.\",\n )", "apis": ["re.split", "random.sample", "pandas.DataFrame", "random.seed"], "libs": ["pandas", "re", "random"], "doc": {"description": ["Randomizes the order of comma-separated substrings within each string in a list,", "normalizing spaces to ensure a single space follows each comma using regex, then", "returns a DataFrame comparing original and randomized strings."], "note": [], "params": ["data_list (list of str): List of strings with substrings to be randomized.", "seed (int, optional): Seed for random number generator for reproducibility. 
Defaults to None."], "returns": ["pandas.DataFrame: A DataFrame with columns 'Original String' and 'Randomized String'."], "reqs": ["pandas", "random", "re"], "raises": [], "example": [">>> df = f_379(['lamp, bag, mirror', 'table, chair, bag'], seed=42)", ">>> df['Original String'][0]", "'lamp, bag, mirror'", ">>> df['Randomized String'][0]", "'mirror, lamp, bag'"]}} -{"task_id": "f_824", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport random\n\n\ndef f_824(start_date, end_date, num_series, seed=None):\n \"\"\"\n Generates a DataFrame with multiple random integer time series (each ranging\n from 0 to 100) from a start date to an end date, then returns the generated time series\n on a line plot.\n\n Parameters:\n - start_date (str): The start date in \"yyyy-mm-dd\" format.\n - end_date (str): The end date in \"yyyy-mm-dd\" format.\n - num_series (int): The number of random time series to generate.\n - seed (int, optional): Seed for the random number generator. Defaults to None (not set).\n\n Returns:\n - pandas.DataFrame: A pandas DataFrame containing the generated time series, indexed by date.\n - plt.Axes: A matplotlib line plot of the time series.\n\n Raises:\n - ValueError: If start_date is later than end_date; or if num_series is less than 1.\n\n Requirements:\n - pandas\n - datetime\n - random\n\n Notes:\n - The line plot's title is set to \"Random Time Series\", the x-axis label to \"Date\",\n and the y-axis label to \"Value\".\n - Each time series is plotted as a separate line with automatic coloring and legend\n entry labeled as \"series_x\" where x is the series number.\n\n Example:\n >>> df, ax = f_824('2020-01-01', '2020-12-31', 3, 42)\n >>> df.head(2)\n series_1 series_2 series_3\n 2020-01-01 81 67 19\n 2020-01-02 14 20 29\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n start_date_dt = datetime.strptime(start_date, \"%Y-%m-%d\")\n end_date_dt = datetime.strptime(end_date, \"%Y-%m-%d\")\n if 
start_date_dt > end_date_dt:\n raise ValueError(\"start_date must be earlier than or equal to end_date.\")\n if num_series < 1:\n raise ValueError(\"num_series must be at least 1.\")\n\n date_range = pd.date_range(start_date_dt, end_date_dt)\n\n data = {}\n for i in range(num_series):\n series_name = f\"series_{i+1}\"\n data[series_name] = [random.randint(0, 100) for _ in range(len(date_range))]\n\n df = pd.DataFrame(data, index=date_range)\n\n ax = df.plot()\n ax.set_title(\"Random Time Series\")\n ax.set_xlabel(\"Date\")\n ax.set_ylabel(\"Value\")\n\n return df, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib\nimport warnings\nclass TestCases(unittest.TestCase):\n def test_valid_input(self):\n \"\"\"Tests correct DataFrame structure and plot type with valid inputs.\"\"\"\n df, ax = f_824(\"2022-01-01\", \"2022-01-10\", 2, seed=42)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape[1], 2)\n self.assertEqual(len(df.index), 10)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertTrue((df <= 100).all().all() and (df >= 0).all().all())\n def test_seed_reproducibility(self):\n \"\"\"Tests if providing a seed results in reproducible outputs.\"\"\"\n df1, _ = f_824(\"2022-01-01\", \"2022-01-05\", 1, seed=42)\n df2, _ = f_824(\"2022-01-01\", \"2022-01-05\", 1, seed=42)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertTrue((df1 <= 100).all().all() and (df1 >= 0).all().all())\n def test_negative_num_series(self):\n \"\"\"Tests if function raises an error when num_series is less than 1.\"\"\"\n with self.assertRaises(ValueError):\n f_824(\"2022-01-01\", \"2022-01-10\", 0)\n def test_start_date_after_end_date(self):\n \"\"\"Tests if function raises an error when start date is after end date.\"\"\"\n with self.assertRaises(ValueError):\n f_824(\"2022-01-10\", \"2022-01-01\", 1)\n def test_single_day_series(self):\n \"\"\"Tests DataFrame structure and plot type when start and end dates are the same.\"\"\"\n with 
warnings.catch_warnings():\n warnings.simplefilter(\"ignore\", category=UserWarning)\n df, ax = f_824(\"2022-07-01\", \"2022-07-01\", 1, seed=42)\n self.assertEqual(len(df.index), 1)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertTrue((df <= 100).all().all() and (df >= 0).all().all())\n def test_multiple_series_names(self):\n \"\"\"Tests if the generated DataFrame contains correct series names.\"\"\"\n df, _ = f_824(\"2022-01-01\", \"2022-01-05\", 3, seed=42)\n expected_columns = [\"series_1\", \"series_2\", \"series_3\"]\n self.assertListEqual(list(df.columns), expected_columns)\n self.assertTrue((df <= 100).all().all() and (df >= 0).all().all())\n def test_plot_attributes(self):\n \"\"\"Tests the attributes of the plot, including title, x-label, and y-label.\"\"\"\n _, ax = f_824(\"2022-01-01\", \"2022-01-05\", 2, seed=42)\n self.assertEqual(ax.get_title(), \"Random Time Series\")\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n self.assertTrue(len(ax.lines) == 2)", "apis": ["random.randint", "random.seed", "datetime.datetime.strptime", "pandas.date_range", "pandas.DataFrame"], "libs": ["pandas", "random", "datetime"], "doc": {"description": ["Generates a DataFrame with multiple random integer time series (each ranging", "from 0 to 100) from a start date to an end date, then returns the generated time series", "on a line plot.", "Notes:", "- The line plot's title is set to \"Random Time Series\", the x-axis label to \"Date\",", "and the y-axis label to \"Value\".", "- Each time series is plotted as a separate line with automatic coloring and legend", "entry labeled as \"series_x\" where x is the series number."], "note": [], "params": ["start_date (str): The start date in \"yyyy-mm-dd\" format.", "end_date (str): The end date in \"yyyy-mm-dd\" format.", "num_series (int): The number of random time series to generate.", "seed (int, optional): Seed for the random number generator. 
Defaults to None (not set)."], "returns": ["pandas.DataFrame: A pandas DataFrame containing the generated time series, indexed by date.", "plt.Axes: A matplotlib line plot of the time series."], "reqs": ["pandas", "datetime", "random"], "raises": ["ValueError: If start_date is later than end_date; or if num_series is less than 1."], "example": [">>> df, ax = f_824('2020-01-01', '2020-12-31', 3, 42)", ">>> df.head(2)", "series_1 series_2 series_3", "2020-01-01 81 67 19", "2020-01-02 14 20 29"]}} -{"task_id": "f_406", "prompt": "import pandas as pd\nfrom scipy.spatial.distance import pdist, squareform\n\n\ndef f_406(array):\n \"\"\"\n Generate a Pandas DataFrame from a 2D list and calculate a distance matrix.\n\n This function converts a 2D list into a DataFrame, with columns named alphabetically starting from 'A'.\n It uses the `chr()` function, which converts an integer to its corresponding Unicode character,\n to dynamically assign alphabetical labels to each column based on their index. The function then\n computes the Euclidean distance matrix between rows.\n\n Parameters:\n array (list of list of int): The 2D list representing the data.\n Each sublist must contain only integers or floats. 
If the input does not\n conform to this structure, a TypeError is raised.\n\n Returns:\n - df (pd.DataFrame): data converted from 2D list.\n - distance_matrix (pd.DataFrame): output distance matrix.\n\n Requirements:\n - pandas\n - scipy.spatial.distance.pdist\n - scipy.spatial.distance.squareform\n\n Example:\n >>> df, distance_matrix = f_406([[1,2,3,4,5], [6,7,8,9,10]])\n >>> print(df)\n A B C D E\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n >>> print(distance_matrix)\n 0 1\n 0 0.00000 11.18034\n 1 11.18034 0.00000\n \"\"\"", "canonical_solution": " if not isinstance(array, list):\n raise TypeError(\"Input must be a list.\")\n\n if not all(isinstance(sublist, list) for sublist in array):\n raise TypeError(\"Input must be a list of lists.\")\n\n for sublist in array:\n if not all(isinstance(item, (int, float)) for item in sublist):\n raise TypeError(\"All elements in the sublists must be int or float.\")\n\n columns = [chr(65 + i) for i in range(len(array[0]))]\n df = pd.DataFrame(array, columns=columns)\n\n distances = pdist(df.values, metric=\"euclidean\")\n distance_matrix = pd.DataFrame(\n squareform(distances), index=df.index, columns=df.index\n )\n\n return df, distance_matrix", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Teset basic case\n input_data = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (2, 5))\n self.assertTrue((df.columns == [\"A\", \"B\", \"C\", \"D\", \"E\"]).all())\n self.assertEqual(distance_matrix.shape, (2, 2))\n self.assertAlmostEqual(distance_matrix.iloc[0, 1], 11.18034, places=5)\n self.assertAlmostEqual(distance_matrix.iloc[1, 0], 11.18034, places=5)\n def test_case_2(self):\n # Test negatives and zero\n input_data = [[-5, -4, -3, -2, -1], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (3, 5))\n self.assertEqual(distance_matrix.shape, (3, 3))\n 
self.assertAlmostEqual(distance_matrix.iloc[0, 1], 7.41620, places=5)\n self.assertAlmostEqual(distance_matrix.iloc[1, 2], 7.41620, places=5)\n def test_case_3(self):\n # Test small lists\n input_data = [[1, 2], [3, 4]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (2, 2))\n self.assertEqual(distance_matrix.shape, (2, 2))\n self.assertAlmostEqual(distance_matrix.iloc[0, 1], 2.82843, places=5)\n def test_case_4(self):\n # Test repeated single element\n input_data = [[5, 5, 5], [5, 5, 5], [5, 5, 5]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (3, 3))\n self.assertEqual(distance_matrix.shape, (3, 3))\n self.assertEqual(distance_matrix.iloc[0, 1], 0)\n self.assertEqual(distance_matrix.iloc[1, 2], 0)\n def test_case_5(self):\n # Test single list\n input_data = [[1, 2, 3, 4, 5]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (1, 5))\n self.assertEqual(distance_matrix.shape, (1, 1))\n self.assertEqual(distance_matrix.iloc[0, 0], 0)\n def test_case_6(self):\n # Test empty list\n input_data = []\n with self.assertRaises(IndexError):\n f_406(input_data)\n def test_case_7(self):\n # Test larger dataset\n input_data = [list(range(100)) for _ in range(50)]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (50, 100))\n self.assertEqual(distance_matrix.shape, (50, 50))\n # No specific values check due to complexity\n def test_case_8(self):\n # Test single element list\n input_data = [[1]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (1, 1))\n self.assertEqual(distance_matrix.shape, (1, 1))\n self.assertEqual(distance_matrix.iloc[0, 0], 0)\n def test_case_9(self):\n # Test with different types in list\n input_data = [[1, 2, 3], [\"a\", \"b\", \"c\"]]\n with self.assertRaises(TypeError):\n f_406(input_data)\n def test_case_10(self):\n # Test with a more complex numerical list (including floats and negatives)\n input_data = [[-1.5, 2.3, 4.5], [0, 0, 0], [5.5, 
-2.3, 3.1]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (3, 3))\n self.assertEqual(distance_matrix.shape, (3, 3))\n # Define expected distances based on manual or precise calculation\n expected_distances = [\n [0.0, 5.27162, 8.49235],\n [5.27162, 0.0, 6.71937],\n [8.49235, 6.71937, 0.0],\n ]\n # Assert each calculated distance matches the expected value\n for i in range(len(expected_distances)):\n for j in range(len(expected_distances[i])):\n self.assertAlmostEqual(\n distance_matrix.iloc[i, j], expected_distances[i][j], places=5\n )", "apis": ["scipy.spatial.distance.pdist", "pandas.DataFrame", "scipy.spatial.distance.squareform"], "libs": ["pandas", "scipy"], "doc": {"description": ["Generate a Pandas DataFrame from a 2D list and calculate a distance matrix.", "This function converts a 2D list into a DataFrame, with columns named alphabetically starting from 'A'.", "It uses the `chr()` function, which converts an integer to its corresponding Unicode character,", "to dynamically assign alphabetical labels to each column based on their index. The function then", "computes the Euclidean distance matrix between rows."], "note": [], "params": ["array (list of list of int): The 2D list representing the data.", "Each sublist must contain only integers or floats. 
If the input does not", "conform to this structure, a TypeError is raised."], "returns": ["df (pd.DataFrame): data converted from 2D list.", "distance_matrix (pd.DataFrame): output distance matrix."], "reqs": ["pandas", "scipy.spatial.distance.pdist", "scipy.spatial.distance.squareform"], "raises": [], "example": [">>> df, distance_matrix = f_406([[1,2,3,4,5], [6,7,8,9,10]])", ">>> print(df)", "A B C D E", "0 1 2 3 4 5", "1 6 7 8 9 10", ">>> print(distance_matrix)", "0 1", "0 0.00000 11.18034", "1 11.18034 0.00000"]}} -{"task_id": "f_888", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\n\n# Constants\nROOMS = [\"Room1\", \"Room2\", \"Room3\", \"Room4\", \"Room5\"]\n\ndef f_888(date_str, booking_data):\n \"\"\"\n This function generates a status report of room bookings for a specified date\n and displays a bar plot representing the booking statuses of various rooms.\n It validates the provided date, compiles a booking status report, and visualizes\n the data in a bar plot.\n\n Parameters:\n - date_str (str): The date for which the booking status needs to be checked,\n in \"yyyy-mm-dd\" format. The function validates this date.\n - booking_data (dict): A dictionary with room names as keys and booking statuses\n as values. The keys should match the rooms listed in the ROOMS constant.\n\n Returns:\n - DataFrame: A pandas DataFrame containing booking status for each room.\n - plt.Axes: A matplotlib Axes object for the bar plot of booking statuses.\n\n Raises:\n - ValueError: Raised in two scenarios:\n 1. If `date_str` does not follow the \"yyyy-mm-dd\" format or is not a valid date.\n 2. 
If `date_str` refers to a past date.\n\n Requirements:\n - pandas\n - datetime\n\n Example:\n >>> from datetime import datetime\n >>> future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n >>> booking_info = {\"Room1\": \"Booked\", \"Room2\": \"Available\"}\n >>> report_df, ax = f_888(future_date, booking_info)\n >>> print(report_df)\n Room Booking Status\n 0 Room1 Booked\n 1 Room2 Available\n 2 Room3 Not Listed\n 3 Room4 Not Listed\n 4 Room5 Not Listed\n \"\"\"", "canonical_solution": " # Validate the date string\n try:\n date = datetime.strptime(date_str, \"%Y-%m-%d\")\n if date < datetime.now():\n raise ValueError(\"Date is in the past. Please provide a future date.\")\n except ValueError as e:\n raise ValueError(f\"Invalid date: {e}\") from e\n\n report_data = [[room, booking_data.get(room, \"Not Listed\")] for room in ROOMS]\n report_df = pd.DataFrame(report_data, columns=[\"Room\", \"Booking Status\"])\n\n # Create a bar plot of the booking statuses\n ax = (\n report_df[\"Booking Status\"]\n .value_counts()\n .plot(kind=\"bar\", title=\"Booking Statuses for \" + date_str)\n )\n\n return report_df, ax", "test": "import unittest\nimport pandas as pd\nfrom datetime import datetime, timedelta\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_888\"\"\"\n def test_future_date_valid_booking_data(self):\n \"\"\"\n Test f_888 with a future date and valid booking data.\n \"\"\"\n future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n booking_data = {\"Room1\": \"Booked\", \"Room2\": \"Available\"}\n report_df, _ = f_888(future_date, booking_data)\n self.assertIn(\"Room1\", report_df[\"Room\"].values)\n self.assertIn(\"Booked\", report_df[\"Booking Status\"].values)\n def test_past_date(self):\n \"\"\"\n Test f_888 with a past date to ensure it raises a ValueError.\n \"\"\"\n past_date = \"2020-01-01\"\n booking_data = {\"Room1\": \"Booked\"}\n with self.assertRaises(ValueError):\n 
f_888(past_date, booking_data)\n def test_invalid_date_format(self):\n \"\"\"\n Test f_888 with an invalid date format to check for ValueError.\n \"\"\"\n invalid_date = \"15-06-2023\"\n booking_data = {\"Room1\": \"Booked\"}\n with self.assertRaises(ValueError):\n f_888(invalid_date, booking_data)\n def test_booking_data_for_nonexistent_room(self):\n \"\"\"\n Test f_888 with booking data for a room not in the ROOMS constant.\n \"\"\"\n future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n booking_data = {\"Room6\": \"Booked\"}\n report_df, _ = f_888(future_date, booking_data)\n self.assertIn(\"Not Listed\", report_df[\"Booking Status\"].values)\n def test_no_booking_data(self):\n \"\"\"\n Test f_888 with no booking data provided.\n \"\"\"\n future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n booking_data = {}\n report_df, _ = f_888(future_date, booking_data)\n self.assertTrue((report_df[\"Booking Status\"] == \"Not Listed\").all())\n def tearDown(self):\n plt.clf()", "apis": ["datetime.datetime.now", "pandas.DataFrame", "datetime.datetime.strptime"], "libs": ["pandas", "datetime"], "doc": {"description": ["This function generates a status report of room bookings for a specified date", "and displays a bar plot representing the booking statuses of various rooms.", "It validates the provided date, compiles a booking status report, and visualizes", "the data in a bar plot."], "note": [], "params": ["date_str (str): The date for which the booking status needs to be checked,", "in \"yyyy-mm-dd\" format. The function validates this date.", "booking_data (dict): A dictionary with room names as keys and booking statuses", "as values. 
The keys should match the rooms listed in the ROOMS constant."], "returns": ["DataFrame: A pandas DataFrame containing booking status for each room.", "plt.Axes: A matplotlib Axes object for the bar plot of booking statuses."], "reqs": ["pandas", "datetime"], "raises": ["ValueError: Raised in two scenarios:", "1. If `date_str` does not follow the \"yyyy-mm-dd\" format or is not a valid date.", "2. If `date_str` refers to a past date."], "example": [">>> from datetime import datetime", ">>> future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")", ">>> booking_info = {\"Room1\": \"Booked\", \"Room2\": \"Available\"}", ">>> report_df, ax = f_888(future_date, booking_info)", ">>> print(report_df)", "Room Booking Status", "0 Room1 Booked", "1 Room2 Available", "2 Room3 Not Listed", "3 Room4 Not Listed", "4 Room5 Not Listed"]}} -{"task_id": "f_892", "prompt": "from datetime import datetime\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_892(date_str):\n \"\"\"\n Plot a sine wave whose frequency is determined by the day of the month from the given date.\n\n Parameters:\n date_str (str): A date in \"yyyy-mm-dd\" format, used to determine the frequency of the sine wave.\n\n Returns:\n matplotlib.axes.Axes: An Axes object containing the plotted sine wave.\n\n Requirements:\n - datetime.datetime\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_892('2023-06-15')\n >>> print(ax.get_title())\n Sine Wave for 2023-06-15 (Frequency: 15)\n \"\"\"", "canonical_solution": " date = datetime.strptime(date_str, \"%Y-%m-%d\")\n x = np.linspace(0, 2 * np.pi, 1000)\n frequency = date.day\n y = np.sin(frequency * x)\n _, ax = plt.subplots()\n ax.plot(x, y)\n ax.set_title(f\"Sine Wave for {date_str} (Frequency: {frequency})\")\n return ax", "test": "import unittest\nimport matplotlib\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_892.\"\"\"\n def test_valid_date(self):\n \"\"\"\n Test with a valid date string to ensure 
the function returns a matplotlib Axes object.\n \"\"\"\n result = f_892(\"2023-06-15\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_leap_year_date(self):\n \"\"\"\n Test with a date from a leap year to check the function's handling of leap years.\n \"\"\"\n result = f_892(\"2024-02-29\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_beginning_of_month(self):\n \"\"\"\n Test with a date at the beginning of the month (low-frequency wave).\n \"\"\"\n result = f_892(\"2023-01-01\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_end_of_month(self):\n \"\"\"\n Test with a date towards the end of the month (high-frequency wave).\n \"\"\"\n result = f_892(\"2023-01-31\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_invalid_date_format(self):\n \"\"\"\n Test with an invalid date format to check if the function raises a ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_892(\"15-06-2023\")\n def tearDown(self):\n plt.close()", "apis": ["numpy.sin", "numpy.pi", "datetime.datetime.strptime", "numpy.linspace", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "datetime"], "doc": {"description": ["Plot a sine wave whose frequency is determined by the day of the month from the given date."], "note": [], "params": ["date_str (str): A date in \"yyyy-mm-dd\" format, used to determine the frequency of the sine wave."], "returns": ["matplotlib.axes.Axes: An Axes object containing the plotted sine wave."], "reqs": ["datetime.datetime", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_892('2023-06-15')", ">>> print(ax.get_title())", "Sine Wave for 2023-06-15 (Frequency: 15)"]}} +{"task_id": "f_340", "prompt": "import pandas as pd\nfrom random import randint\n\n\ndef f_340(name: str, age: int, code: str, salary: float, bio: str) -> pd.DataFrame:\n \"\"\"\n Generate a Pandas DataFrame of employees with their details based on the input provided.\n\n Parameters:\n - 
name (str): Name of the employee. This is case-sensitive. Must be one of the predefined\n names: 'John', 'Alice', 'Bob', 'Charlie', 'David', otherwise the function raises\n ValueError.\n - age (int): Age of the employee.\n - code (str): Code of the employee.\n - salary (float): Salary of the employee.\n - bio (str): Biography of the employee.\n\n Returns:\n data_df (pd.DataFrame): dataframe with columns: 'Name', 'Age', 'Code', 'Salary', 'Bio', 'Job Title'.\n The 'Job Title' is randomly assigned from the predefined job titles:\n 'Engineer', 'Manager', 'Analyst', 'Developer', 'Tester'.\n\n Requirements:\n - pandas\n - random.randint\n\n Example:\n >>> random.seed(0)\n >>> df = f_340(\"John\", 30, \"A10B\", 5000.0, \"This is a bio with spaces\")\n >>> print(df)\n Name Age Code Salary Bio Job Title\n 0 John 30 A10B 5000.0 This is a bio with spaces Developer\n \"\"\"", "canonical_solution": " EMPLOYEES = [\"John\", \"Alice\", \"Bob\", \"Charlie\", \"David\"]\n JOBS = [\"Engineer\", \"Manager\", \"Analyst\", \"Developer\", \"Tester\"]\n\n if name not in EMPLOYEES:\n raise ValueError(f\"Invalid employee name. 
Must be one of {EMPLOYEES}\")\n\n job = JOBS[randint(0, len(JOBS) - 1)]\n data_df = pd.DataFrame(\n [[name, age, code, salary, bio, job]],\n columns=[\"Name\", \"Age\", \"Code\", \"Salary\", \"Bio\", \"Job Title\"],\n )\n return data_df", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test the DataFrame structure for a known input\n df = f_340(\"John\", 30, \"A10B\", 5000.0, \"Sample bio\")\n expected_columns = [\"Name\", \"Age\", \"Code\", \"Salary\", \"Bio\", \"Job Title\"]\n self.assertListEqual(\n list(df.columns), expected_columns, \"DataFrame columns mismatch\"\n )\n for col, dtype in zip(\n df.columns, [\"object\", \"int64\", \"object\", \"float64\", \"object\", \"object\"]\n ):\n self.assertTrue(\n df[col].dtype == dtype,\n f\"Column {col} has incorrect type {df[col].dtype}\",\n )\n def test_case_2(self):\n # Test minimum and maximum valid ages and salary, including edge cases\n df_min_age = f_340(\"Alice\", 18, \"X10Y\", 0.0, \"Minimum age and salary\")\n self.assertEqual(df_min_age[\"Age\"][0], 18)\n self.assertEqual(df_min_age[\"Salary\"][0], 0.0)\n df_max_age = f_340(\"Bob\", 65, \"Z99W\", 1000000.0, \"Maximum age and high salary\")\n self.assertEqual(df_max_age[\"Age\"][0], 65)\n self.assertEqual(df_max_age[\"Salary\"][0], 1000000.0)\n def test_case_3(self):\n # Test bio with special characters, very long string, and empty string\n df_special_bio = f_340(\"Charlie\", 30, \"C30D\", 5300.0, \"!@#$%^&*()_+|\")\n self.assertEqual(df_special_bio[\"Bio\"][0], \"!@#$%^&*()_+|\")\n df_long_bio = f_340(\"David\", 30, \"D40E\", 5400.5, \"a\" * 1000)\n self.assertEqual(len(df_long_bio[\"Bio\"][0]), 1000)\n df_empty_bio = f_340(\"John\", 30, \"E50F\", 5500.0, \"\")\n self.assertEqual(df_empty_bio[\"Bio\"][0], \"\")\n def test_case_4(self):\n # Test code with different formats\n df_code_special_chars = f_340(\n \"Alice\", 25, \"!@#$\", 5500.5, \"Bio with special char code\"\n )\n 
self.assertEqual(df_code_special_chars[\"Code\"][0], \"!@#$\")\n def test_case_5(self):\n # Test for case sensitivity\n with self.assertRaises(ValueError):\n f_340(\"john\", 30, \"J01K\", 5000.0, \"Case sensitive name test\")\n def test_case_6(self):\n # Test each predefined name\n for name in [\"John\", \"Alice\", \"Bob\", \"Charlie\", \"David\"]:\n df = f_340(name, 30, \"A10B\", 5000.0, f\"{name}'s bio\")\n self.assertEqual(\n df[\"Name\"][0], name, f\"Valid name {name} failed to create a DataFrame\"\n )\n def test_case_7(self):\n # Test randomness in job assignment\n job_titles_first_run = []\n job_titles_second_run = []\n job_titles_third_run = []\n n_iter = 15\n name, age, code, salary, bio = (\n \"Bob\",\n 30,\n \"B20C\",\n 5000.0,\n \"Testing randomness in job titles\",\n )\n random.seed(42) # Set the seed for the first run\n for _ in range(n_iter):\n df = f_340(name, age, code, salary, bio)\n job_titles_first_run.append(df[\"Job Title\"][0])\n random.seed(42) # Reset the seed to ensure reproducibility for the second run\n for _ in range(n_iter):\n df = f_340(name, age, code, salary, bio)\n job_titles_second_run.append(df[\"Job Title\"][0])\n random.seed(0) # Repeat for third run with different seed\n for _ in range(n_iter):\n df = f_340(name, age, code, salary, bio)\n job_titles_third_run.append(df[\"Job Title\"][0])\n self.assertEqual(job_titles_first_run, job_titles_second_run)\n self.assertNotEqual(job_titles_first_run, job_titles_third_run)\n def test_case_8(self):\n # Test invalid name\n with self.assertRaises(ValueError):\n f_340(\"InvalidName\", 28, \"C30D\", 5300.0, \"Bio of InvalidName\")", "apis": ["pandas.DataFrame", "random.randint"], "libs": ["random", "pandas"], "doc": {"description": ["Generate a Pandas DataFrame of employees with their details based on the input provided."], "note": [], "params": ["name (str): Name of the employee. This is case-sensitive. 
Must be one of the predefined", "names: 'John', 'Alice', 'Bob', 'Charlie', 'David', otherwise the function raises", "ValueError.", "age (int): Age of the employee.", "code (str): Code of the employee.", "salary (float): Salary of the employee.", "bio (str): Biography of the employee."], "returns": ["data_df (pd.DataFrame): dataframe with columns: 'Name', 'Age', 'Code', 'Salary', 'Bio', 'Job Title'.", "The 'Job Title' is randomly assigned from the predefined job titles:", "'Engineer', 'Manager', 'Analyst', 'Developer', 'Tester'."], "reqs": ["pandas", "random.randint"], "raises": [], "example": [">>> random.seed(0)", ">>> df = f_340(\"John\", 30, \"A10B\", 5000.0, \"This is a bio with spaces\")", ">>> print(df)", "Name Age Code Salary Bio Job Title", "0 John 30 A10B 5000.0 This is a bio with spaces Developer"]}} +{"task_id": "f_360", "prompt": "import json\nimport re\nimport pandas as pd\n\n\ndef f_360(json_str):\n \"\"\"\n Load a JSON string into a dictionary, normalize the dictionary by doubling the numerical values,\n and then create a Pandas DataFrame from the dictionary.\n\n This function processes a JSON string by converting it into a dictionary, normalizes the data\n by doubling the numerical values, and then constructs a Pandas DataFrame from this dictionary.\n Note: the function is designed to handle simple flat dictionaries, with values that are either\n single numerical values, lists of numerical values, or strings that can be interpreted as\n numbers. It doubles the values of numerical data types within the dictionary, including those\n within lists and those in strings (which are extracted using regex), but the function does not\n process nested dictionaries. 
Finally, it returns the DataFrame with numerical values stored as\n floats and other types left as-is, or an empty DataFrame if the input JSON string is empty or\n does not contain any valid data structures for DataFrame conversion.\n\n Parameters:\n json_str (str): The JSON string.\n\n Returns:\n DataFrame: A pandas DataFrame created from the dictionary.\n\n Requirements:\n - pandas\n - json\n - re\n\n Example:\n >>> json_str = '{\"a\": [1, 2, 3], \"b\": 4.9, \"c\": \"5\"}'\n >>> df = f_360(json_str)\n >>> type(df)\n \n >>> print(df)\n a b c\n 0 2 9.8 10\n 1 4 9.8 10\n 2 6 9.8 10\n \"\"\"", "canonical_solution": " NUMBERS = re.compile(r\"^-?\\d+(?:\\.\\d+)?$\")\n\n my_dict = json.loads(json_str)\n\n if not my_dict:\n return pd.DataFrame()\n\n for key, value in my_dict.items():\n if isinstance(value, list):\n my_dict[key] = [v * 2 if isinstance(v, (int, float)) else v for v in value]\n elif isinstance(value, (int, float)):\n my_dict[key] = value * 2\n elif isinstance(value, str) and NUMBERS.match(value):\n try:\n my_dict[key] = int(value) * 2\n except ValueError:\n my_dict[key] = float(value) * 2\n\n if all(not isinstance(v, list) for v in my_dict.values()):\n df = pd.DataFrame([my_dict])\n else:\n df = pd.DataFrame(my_dict)\n\n for col in df.columns:\n converted_col = pd.to_numeric(df[col], errors=\"coerce\")\n if not converted_col.isnull().any():\n df[col] = converted_col\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n json_str = '{\"a\": [1, 2, 3], \"b\": 4.9, \"c\": \"5\"}'\n expected_output = pd.DataFrame(\n {\"a\": [2, 4, 6], \"b\": [9.8, 9.8, 9.8], \"c\": [10, 10, 10]}\n )\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_2(self):\n json_str = \"{}\"\n expected_output = pd.DataFrame()\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_3(self):\n json_str = '{\"a\": [1, \"apple\", 3], \"b\": 4.9, \"c\": \"5\", \"d\": 
\"banana\"}'\n expected_output = pd.DataFrame(\n {\n \"a\": [2, \"apple\", 6],\n \"b\": [9.8, 9.8, 9.8],\n \"c\": [10, 10, 10],\n \"d\": [\"banana\", \"banana\", \"banana\"],\n }\n )\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_4(self):\n json_str = '{\"a\": \"1\", \"b\": \"2.5\", \"c\": \"string\"}'\n expected_output = pd.DataFrame({\"a\": [2], \"b\": [5.0], \"c\": [\"string\"]})\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)\n def test_case_5(self):\n json_str = '{\"a\": [1, 2, {\"b\": 3}], \"c\": 4.9}'\n expected_output = pd.DataFrame({\"a\": [2, 4, {\"b\": 3}], \"c\": [9.8, 9.8, 9.8]})\n pd.testing.assert_frame_equal(f_360(json_str), expected_output)", "apis": ["pandas.DataFrame", "re.compile", "json.loads", "pandas.to_numeric"], "libs": ["re", "json", "pandas"], "doc": {"description": ["Load a JSON string into a dictionary, normalize the dictionary by doubling the numerical values,", "and then create a Pandas DataFrame from the dictionary.", "This function processes a JSON string by converting it into a dictionary, normalizes the data", "by doubling the numerical values, and then constructs a Pandas DataFrame from this dictionary."], "note": ["the function is designed to handle simple flat dictionaries, with values that are either", "single numerical values, lists of numerical values, or strings that can be interpreted as", "numbers. It doubles the values of numerical data types within the dictionary, including those", "within lists and those in strings (which are extracted using regex), but the function does not", "process nested dictionaries. 
Finally, it returns the DataFrame with numerical values stored as", "floats and other types left as-is, or an empty DataFrame if the input JSON string is empty or", "does not contain any valid data structures for DataFrame conversion."], "params": ["json_str (str): The JSON string."], "returns": ["DataFrame: A pandas DataFrame created from the dictionary."], "reqs": ["pandas", "json", "re"], "raises": [], "example": [">>> json_str = '{\"a\": [1, 2, 3], \"b\": 4.9, \"c\": \"5\"}'", ">>> df = f_360(json_str)", ">>> type(df)", "", ">>> print(df)", "a b c", "0 2 9.8 10", "1 4 9.8 10", "2 6 9.8 10"]}} +{"task_id": "f_411", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_411(data):\n \"\"\"\n Combine a list of dictionaries with the same keys into a single dictionary, turn it into a\n Pandas DataFrame and create a line plot of the data.\n\n Parameters:\n data (list): A list of dictionaries. The keys are labels and the values are data points.\n\n Returns:\n matplotlib.axes._subplots.Axes or None: Axes object of the plot showing 'Data over Time',\n with 'Time' on the x-axis and 'Data Points' on the y-axis.\n If data is empty, return None.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_411([{'A': 10, 'B': 15, 'C': 12},\\\n {'A': 12, 'B': 20, 'C': 14},\\\n {'A': 15, 'B': 18, 'C': 15},\\\n {'A': 11, 'B': 17, 'C': 13}])\n >>> type(ax)\n \n >>> ax.get_title()\n 'Data over Time'\n >>> len(ax.lines)\n 3\n \"\"\"", "canonical_solution": " if not data:\n return None\n df = pd.DataFrame(data)\n plt.figure()\n for label in df.columns:\n plt.plot(df[label], label=label)\n plt.xlabel(\"Time\")\n plt.ylabel(\"Data Points\")\n plt.title(\"Data over Time\")\n return plt.gca()", "test": "import unittest\nimport matplotlib\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.data1 = [\n {\"A\": 10, \"B\": 15, \"C\": 12},\n {\"A\": 12, 
\"B\": 20, \"C\": 14},\n {\"A\": 15, \"B\": 18, \"C\": 15},\n {\"A\": 11, \"B\": 17, \"C\": 13},\n ]\n cls.data2 = [\n {\"X\": 5, \"Y\": 8},\n {\"X\": 6, \"Y\": 7},\n {\"X\": 7, \"Y\": 6},\n {\"X\": 8, \"Y\": 5},\n ]\n cls.data3 = [{\"P\": 3, \"Q\": 2, \"R\": 4, \"S\": 1}, {\"P\": 4, \"Q\": 3, \"R\": 2, \"S\": 3}]\n cls.data4 = [{\"W\": 7}, {\"W\": 8}, {\"W\": 9}, {\"W\": 6}]\n cls.data5 = [{\"M\": 1, \"N\": 3}, {\"M\": 3, \"N\": 1}]\n def test_case_1(self):\n # Test for correct Axes instance and labels for a typical data set\n ax = f_411(self.data1)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(ax.get_title(), \"Data over Time\")\n self.assertEqual(ax.get_xlabel(), \"Time\")\n self.assertEqual(ax.get_ylabel(), \"Data Points\")\n self.assertEqual(len(ax.lines), 3)\n def test_case_2(self):\n # Test for different keys across dictionaries in data list\n data = [{\"A\": 1, \"B\": 2}, {\"B\": 3, \"C\": 4}, {\"A\": 5, \"C\": 6}]\n ax = f_411(data)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertTrue(len(ax.lines) > 0)\n def test_case_3(self):\n # Test with empty data list\n self.assertIsNone(f_411([]))\n def test_case_4(self):\n # Test with data containing non-numeric values\n data = [{\"A\": \"text\", \"B\": \"more text\"}, {\"A\": 1, \"B\": 2}]\n with self.assertRaises(TypeError):\n f_411(data)\n def test_case_5(self):\n # Test with a single entry in the data list\n data = [{\"A\": 1, \"B\": 2}]\n ax = f_411(data)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(len(ax.lines), 2)\n def test_case_6(self):\n # Test focusing on data processing correctness\n data = [\n {\"A\": 10, \"B\": 15, \"C\": 12},\n {\"A\": 12, \"B\": 20, \"C\": 14},\n {\"A\": 15, \"B\": 18, \"C\": 15},\n {\"A\": 11, \"B\": 17, \"C\": 13},\n ]\n ax = f_411(data)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n # Convert input data to DataFrame for easy comparison\n input_df = pd.DataFrame(data)\n # Iterate through each line in the plot and 
check against the input data\n for line in ax.lines:\n label = line.get_label()\n _, y_data = line.get_data()\n expected_y_data = input_df[label].values\n # Use numpy to compare the y_data from plot and expected data from input\n np.testing.assert_array_equal(\n y_data, expected_y_data, err_msg=f\"Data mismatch for label {label}\"\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.figure", "pandas.DataFrame", "matplotlib.pyplot.title", "matplotlib.pyplot.plot", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "matplotlib.pyplot.xlabel"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Combine a list of dictionaries with the same keys into a single dictionary, turn it into a", "Pandas DataFrame and create a line plot of the data."], "note": [], "params": ["data (list): A list of dictionaries. The keys are labels and the values are data points."], "returns": ["matplotlib.axes._subplots.Axes or None: Axes object of the plot showing 'Data over Time',", "with 'Time' on the x-axis and 'Data Points' on the y-axis.", "If data is empty, return None."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_411([{'A': 10, 'B': 15, 'C': 12},\\", "{'A': 12, 'B': 20, 'C': 14},\\", "{'A': 15, 'B': 18, 'C': 15},\\", "{'A': 11, 'B': 17, 'C': 13}])", ">>> type(ax)", "", ">>> ax.get_title()", "'Data over Time'", ">>> len(ax.lines)", "3"]}} +{"task_id": "f_379", "prompt": "import pandas as pd\nimport random\nimport re\n\n\ndef f_379(data_list, seed=42):\n \"\"\"\n Randomizes the order of comma-separated substrings within each string in a list,\n normalizing spaces to ensure a single space follows each comma using regex, then\n returns a DataFrame comparing original and randomized strings.\n\n Parameters:\n data_list (list of str): List of strings with substrings to be randomized.\n seed (int, optional): Seed for random number generator for reproducibility. 
Defaults to None.\n\n Returns:\n pandas.DataFrame: A DataFrame with columns 'Original String' and 'Randomized String'.\n\n Requirements:\n - pandas\n - random\n - re\n\n Example:\n >>> df = f_379(['lamp, bag, mirror', 'table, chair, bag'], seed=42)\n >>> df['Original String'][0]\n 'lamp, bag, mirror'\n >>> df['Randomized String'][0]\n 'mirror, lamp, bag'\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n df = pd.DataFrame(data_list, columns=[\"Original String\"])\n\n randomized_strings = []\n for s in data_list:\n substrings = re.split(\"\\s*,\\s*\", s)\n random_positions = random.sample(range(len(substrings)), len(substrings))\n randomized_s = \", \".join([substrings[i] for i in random_positions])\n randomized_strings.append(randomized_s)\n\n df[\"Randomized String\"] = randomized_strings\n\n return df", "test": "import unittest\nimport pandas as pd\nimport re\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic functionality with a reproducible seed\n input_data = [\"a, b\", \"c, d, e\"]\n df = f_379(input_data, seed=42)\n self.assertEqual(len(df), 2)\n self.assertListEqual(df[\"Original String\"].tolist(), input_data)\n self.assertNotEqual(\n df[\"Original String\"].tolist(), df[\"Randomized String\"].tolist()\n )\n self.assertSetEqual(\n set(df[\"Original String\"].tolist()[0].split(\", \")),\n set(df[\"Randomized String\"].tolist()[0].split(\", \")),\n )\n def test_case_2(self):\n # Test function's behavior with an empty input list\n input_data = []\n df = f_379(input_data)\n self.assertEqual(len(df), 0)\n def test_case_3(self):\n # Test with single items (no commas) to verify output matches input exactly\n input_data = [\"a\", \"b\", \"c\"]\n df = f_379(input_data)\n self.assertListEqual(\n df[\"Original String\"].tolist(), df[\"Randomized String\"].tolist()\n )\n def test_case_4(self):\n # Test with strings containing only commas\n input_data = [\",,,\", \",,\"]\n expected_output = [\", , , \", \", , \"]\n df = 
f_379(input_data)\n self.assertTrue(\n all(df[\"Randomized String\"].apply(lambda x: x in expected_output))\n )\n def test_case_5(self):\n # Test strings with inconsistent use of spaces and delimiters\n input_data = [\"a,b, c\", \"d ,e, f\"] # Inputs with inconsistent spacing\n df = f_379(input_data, seed=24)\n for i in range(len(input_data)):\n original_substrings = set(re.split(\"\\s*,\\s*\", input_data[i]))\n randomized_substrings = set(df[\"Randomized String\"].iloc[i].split(\", \"))\n self.assertEqual(\n original_substrings,\n randomized_substrings,\n )\n def test_case_6(self):\n # Test with strings that include special characters\n input_data = [\"!@#, $%^\", \"&*(), )(_+\"]\n df = f_379(input_data, seed=99)\n self.assertEqual(len(df), 2)\n for orig, rand in zip(df[\"Original String\"], df[\"Randomized String\"]):\n self.assertSetEqual(set(orig.split(\", \")), set(rand.split(\", \")))\n def test_case_7(self):\n # Test random seed\n input_data = [\"lamp, bag, mirror\", \"table, chair, vase\"]\n df1 = f_379(input_data, seed=42)\n df2 = f_379(input_data, seed=42)\n self.assertListEqual(\n df1[\"Randomized String\"].tolist(), df2[\"Randomized String\"].tolist()\n )\n def test_case_8(self):\n # Test the handling of non-standard separators\n input_data = [\"a;b;c\", \"d:e:f\"]\n df = f_379(input_data)\n self.assertListEqual(\n df[\"Original String\"].tolist(), df[\"Randomized String\"].tolist()\n )\n def test_case_9(self):\n ## Test handling of strings with commas not followed by spaces\n input_data = [\"a,b,c\", \"d,e,f\"]\n df = f_379(input_data, seed=42)\n for idx in range(len(input_data)):\n original_substrings = set(re.split(\",\\s*\", input_data[idx].strip()))\n randomized_substrings = set(df[\"Randomized String\"].iloc[idx].split(\", \"))\n self.assertEqual(\n original_substrings,\n randomized_substrings,\n \"Substrings should be preserved and normalized after randomization.\",\n )\n def test_case_10(self):\n # Test handling of strings with leading or 
trailing spaces\n input_data = [\" a, b, c \", \" d, e, f \"]\n df = f_379(input_data, seed=42)\n for idx in range(len(input_data)):\n original_substrings = set(\n x.strip() for x in re.split(\",\\s*\", input_data[idx].strip())\n )\n randomized_substrings = set(\n x.strip() for x in df[\"Randomized String\"].iloc[idx].split(\", \")\n )\n self.assertEqual(\n original_substrings,\n randomized_substrings,\n \"Ensure substrings match after randomization, ignoring leading/trailing spaces.\",\n )\n def test_case_11(self):\n # Test handling of strings with multiple spaces after a comma\n input_data = [\"a, b, c\", \"d, e, f\"]\n df = f_379(input_data, seed=42)\n for rand_str in df[\"Randomized String\"].tolist():\n self.assertTrue(\n \", \" not in rand_str\n and \", \" not in rand_str\n and \", \" not in rand_str,\n \"Multiple spaces after commas should not appear in output.\",\n )", "apis": ["pandas.DataFrame", "re.split", "random.seed", "random.sample"], "libs": ["random", "re", "pandas"], "doc": {"description": ["Randomizes the order of comma-separated substrings within each string in a list,", "normalizing spaces to ensure a single space follows each comma using regex, then", "returns a DataFrame comparing original and randomized strings."], "note": [], "params": ["data_list (list of str): List of strings with substrings to be randomized.", "seed (int, optional): Seed for random number generator for reproducibility. 
Defaults to None."], "returns": ["pandas.DataFrame: A DataFrame with columns 'Original String' and 'Randomized String'."], "reqs": ["pandas", "random", "re"], "raises": [], "example": [">>> df = f_379(['lamp, bag, mirror', 'table, chair, bag'], seed=42)", ">>> df['Original String'][0]", "'lamp, bag, mirror'", ">>> df['Randomized String'][0]", "'mirror, lamp, bag'"]}} +{"task_id": "f_824", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport random\n\n\ndef f_824(start_date, end_date, num_series, seed=None):\n \"\"\"\n Generates a DataFrame with multiple random integer time series (each ranging\n from 0 to 100) from a start date to an end date, then returns the generated time series\n on a line plot.\n\n Parameters:\n - start_date (str): The start date in \"yyyy-mm-dd\" format.\n - end_date (str): The end date in \"yyyy-mm-dd\" format.\n - num_series (int): The number of random time series to generate.\n - seed (int, optional): Seed for the random number generator. Defaults to None (not set).\n\n Returns:\n - pandas.DataFrame: A pandas DataFrame containing the generated time series, indexed by date.\n - plt.Axes: A matplotlib line plot of the time series.\n\n Raises:\n - ValueError: If start_date is later than end_date; or if num_series is less than 1.\n\n Requirements:\n - pandas\n - datetime\n - random\n\n Notes:\n - The line plot's title is set to \"Random Time Series\", the x-axis label to \"Date\",\n and the y-axis label to \"Value\".\n - Each time series is plotted as a separate line with automatic coloring and legend\n entry labeled as \"series_x\" where x is the series number.\n\n Example:\n >>> df, ax = f_824('2020-01-01', '2020-12-31', 3, 42)\n >>> df.head(2)\n series_1 series_2 series_3\n 2020-01-01 81 67 19\n 2020-01-02 14 20 29\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n start_date_dt = datetime.strptime(start_date, \"%Y-%m-%d\")\n end_date_dt = datetime.strptime(end_date, \"%Y-%m-%d\")\n if 
start_date_dt > end_date_dt:\n raise ValueError(\"start_date must be earlier than or equal to end_date.\")\n if num_series < 1:\n raise ValueError(\"num_series must be at least 1.\")\n\n date_range = pd.date_range(start_date_dt, end_date_dt)\n\n data = {}\n for i in range(num_series):\n series_name = f\"series_{i+1}\"\n data[series_name] = [random.randint(0, 100) for _ in range(len(date_range))]\n\n df = pd.DataFrame(data, index=date_range)\n\n ax = df.plot()\n ax.set_title(\"Random Time Series\")\n ax.set_xlabel(\"Date\")\n ax.set_ylabel(\"Value\")\n\n return df, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib\nimport warnings\nclass TestCases(unittest.TestCase):\n def test_valid_input(self):\n \"\"\"Tests correct DataFrame structure and plot type with valid inputs.\"\"\"\n df, ax = f_824(\"2022-01-01\", \"2022-01-10\", 2, seed=42)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape[1], 2)\n self.assertEqual(len(df.index), 10)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertTrue((df <= 100).all().all() and (df >= 0).all().all())\n def test_seed_reproducibility(self):\n \"\"\"Tests if providing a seed results in reproducible outputs.\"\"\"\n df1, _ = f_824(\"2022-01-01\", \"2022-01-05\", 1, seed=42)\n df2, _ = f_824(\"2022-01-01\", \"2022-01-05\", 1, seed=42)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertTrue((df1 <= 100).all().all() and (df1 >= 0).all().all())\n def test_negative_num_series(self):\n \"\"\"Tests if function raises an error when num_series is less than 1.\"\"\"\n with self.assertRaises(ValueError):\n f_824(\"2022-01-01\", \"2022-01-10\", 0)\n def test_start_date_after_end_date(self):\n \"\"\"Tests if function raises an error when start date is after end date.\"\"\"\n with self.assertRaises(ValueError):\n f_824(\"2022-01-10\", \"2022-01-01\", 1)\n def test_single_day_series(self):\n \"\"\"Tests DataFrame structure and plot type when start and end dates are the same.\"\"\"\n with 
warnings.catch_warnings():\n warnings.simplefilter(\"ignore\", category=UserWarning)\n df, ax = f_824(\"2022-07-01\", \"2022-07-01\", 1, seed=42)\n self.assertEqual(len(df.index), 1)\n self.assertIsInstance(ax, matplotlib.axes._axes.Axes)\n self.assertTrue((df <= 100).all().all() and (df >= 0).all().all())\n def test_multiple_series_names(self):\n \"\"\"Tests if the generated DataFrame contains correct series names.\"\"\"\n df, _ = f_824(\"2022-01-01\", \"2022-01-05\", 3, seed=42)\n expected_columns = [\"series_1\", \"series_2\", \"series_3\"]\n self.assertListEqual(list(df.columns), expected_columns)\n self.assertTrue((df <= 100).all().all() and (df >= 0).all().all())\n def test_plot_attributes(self):\n \"\"\"Tests the attributes of the plot, including title, x-label, and y-label.\"\"\"\n _, ax = f_824(\"2022-01-01\", \"2022-01-05\", 2, seed=42)\n self.assertEqual(ax.get_title(), \"Random Time Series\")\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n self.assertTrue(len(ax.lines) == 2)", "apis": ["pandas.DataFrame", "random.randint", "pandas.date_range", "datetime.datetime.strptime", "random.seed"], "libs": ["random", "pandas", "datetime"], "doc": {"description": ["Generates a DataFrame with multiple random integer time series (each ranging", "from 0 to 100) from a start date to an end date, then returns the generated time series", "on a line plot.", "Notes:", "- The line plot's title is set to \"Random Time Series\", the x-axis label to \"Date\",", "and the y-axis label to \"Value\".", "- Each time series is plotted as a separate line with automatic coloring and legend", "entry labeled as \"series_x\" where x is the series number."], "note": [], "params": ["start_date (str): The start date in \"yyyy-mm-dd\" format.", "end_date (str): The end date in \"yyyy-mm-dd\" format.", "num_series (int): The number of random time series to generate.", "seed (int, optional): Seed for the random number generator. 
Defaults to None (not set)."], "returns": ["pandas.DataFrame: A pandas DataFrame containing the generated time series, indexed by date.", "plt.Axes: A matplotlib line plot of the time series."], "reqs": ["pandas", "datetime", "random"], "raises": ["ValueError: If start_date is later than end_date; or if num_series is less than 1."], "example": [">>> df, ax = f_824('2020-01-01', '2020-12-31', 3, 42)", ">>> df.head(2)", "series_1 series_2 series_3", "2020-01-01 81 67 19", "2020-01-02 14 20 29"]}} +{"task_id": "f_406", "prompt": "import pandas as pd\nfrom scipy.spatial.distance import pdist, squareform\n\n\ndef f_406(array):\n \"\"\"\n Generate a Pandas DataFrame from a 2D list and calculate a distance matrix.\n\n This function converts a 2D list into a DataFrame, with columns named alphabetically starting from 'A'.\n It uses the `chr()` function, which converts an integer to its corresponding Unicode character,\n to dynamically assign alphabetical labels to each column based on their index. The function then\n computes the Euclidean distance matrix between rows.\n\n Parameters:\n array (list of list of int): The 2D list representing the data.\n Each sublist must contain only integers or floats. 
If the input does not\n conform to this structure, a TypeError is raised.\n\n Returns:\n - df (pd.DataFrame): data converted from 2D list.\n - distance_matrix (pd.DataFrame): output distance matrix.\n\n Requirements:\n - pandas\n - scipy.spatial.distance.pdist\n - scipy.spatial.distance.squareform\n\n Example:\n >>> df, distance_matrix = f_406([[1,2,3,4,5], [6,7,8,9,10]])\n >>> print(df)\n A B C D E\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n >>> print(distance_matrix)\n 0 1\n 0 0.00000 11.18034\n 1 11.18034 0.00000\n \"\"\"", "canonical_solution": " if not isinstance(array, list):\n raise TypeError(\"Input must be a list.\")\n\n if not all(isinstance(sublist, list) for sublist in array):\n raise TypeError(\"Input must be a list of lists.\")\n\n for sublist in array:\n if not all(isinstance(item, (int, float)) for item in sublist):\n raise TypeError(\"All elements in the sublists must be int or float.\")\n\n columns = [chr(65 + i) for i in range(len(array[0]))]\n df = pd.DataFrame(array, columns=columns)\n\n distances = pdist(df.values, metric=\"euclidean\")\n distance_matrix = pd.DataFrame(\n squareform(distances), index=df.index, columns=df.index\n )\n\n return df, distance_matrix", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Teset basic case\n input_data = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (2, 5))\n self.assertTrue((df.columns == [\"A\", \"B\", \"C\", \"D\", \"E\"]).all())\n self.assertEqual(distance_matrix.shape, (2, 2))\n self.assertAlmostEqual(distance_matrix.iloc[0, 1], 11.18034, places=5)\n self.assertAlmostEqual(distance_matrix.iloc[1, 0], 11.18034, places=5)\n def test_case_2(self):\n # Test negatives and zero\n input_data = [[-5, -4, -3, -2, -1], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (3, 5))\n self.assertEqual(distance_matrix.shape, (3, 3))\n 
self.assertAlmostEqual(distance_matrix.iloc[0, 1], 7.41620, places=5)\n self.assertAlmostEqual(distance_matrix.iloc[1, 2], 7.41620, places=5)\n def test_case_3(self):\n # Test small lists\n input_data = [[1, 2], [3, 4]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (2, 2))\n self.assertEqual(distance_matrix.shape, (2, 2))\n self.assertAlmostEqual(distance_matrix.iloc[0, 1], 2.82843, places=5)\n def test_case_4(self):\n # Test repeated single element\n input_data = [[5, 5, 5], [5, 5, 5], [5, 5, 5]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (3, 3))\n self.assertEqual(distance_matrix.shape, (3, 3))\n self.assertEqual(distance_matrix.iloc[0, 1], 0)\n self.assertEqual(distance_matrix.iloc[1, 2], 0)\n def test_case_5(self):\n # Test single list\n input_data = [[1, 2, 3, 4, 5]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (1, 5))\n self.assertEqual(distance_matrix.shape, (1, 1))\n self.assertEqual(distance_matrix.iloc[0, 0], 0)\n def test_case_6(self):\n # Test empty list\n input_data = []\n with self.assertRaises(IndexError):\n f_406(input_data)\n def test_case_7(self):\n # Test larger dataset\n input_data = [list(range(100)) for _ in range(50)]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (50, 100))\n self.assertEqual(distance_matrix.shape, (50, 50))\n # No specific values check due to complexity\n def test_case_8(self):\n # Test single element list\n input_data = [[1]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (1, 1))\n self.assertEqual(distance_matrix.shape, (1, 1))\n self.assertEqual(distance_matrix.iloc[0, 0], 0)\n def test_case_9(self):\n # Test with different types in list\n input_data = [[1, 2, 3], [\"a\", \"b\", \"c\"]]\n with self.assertRaises(TypeError):\n f_406(input_data)\n def test_case_10(self):\n # Test with a more complex numerical list (including floats and negatives)\n input_data = [[-1.5, 2.3, 4.5], [0, 0, 0], [5.5, 
-2.3, 3.1]]\n df, distance_matrix = f_406(input_data)\n self.assertEqual(df.shape, (3, 3))\n self.assertEqual(distance_matrix.shape, (3, 3))\n # Define expected distances based on manual or precise calculation\n expected_distances = [\n [0.0, 5.27162, 8.49235],\n [5.27162, 0.0, 6.71937],\n [8.49235, 6.71937, 0.0],\n ]\n # Assert each calculated distance matches the expected value\n for i in range(len(expected_distances)):\n for j in range(len(expected_distances[i])):\n self.assertAlmostEqual(\n distance_matrix.iloc[i, j], expected_distances[i][j], places=5\n )", "apis": ["pandas.DataFrame", "scipy.spatial.distance.squareform", "scipy.spatial.distance.pdist"], "libs": ["pandas", "scipy"], "doc": {"description": ["Generate a Pandas DataFrame from a 2D list and calculate a distance matrix.", "This function converts a 2D list into a DataFrame, with columns named alphabetically starting from 'A'.", "It uses the `chr()` function, which converts an integer to its corresponding Unicode character,", "to dynamically assign alphabetical labels to each column based on their index. The function then", "computes the Euclidean distance matrix between rows."], "note": [], "params": ["array (list of list of int): The 2D list representing the data.", "Each sublist must contain only integers or floats. 
If the input does not", "conform to this structure, a TypeError is raised."], "returns": ["df (pd.DataFrame): data converted from 2D list.", "distance_matrix (pd.DataFrame): output distance matrix."], "reqs": ["pandas", "scipy.spatial.distance.pdist", "scipy.spatial.distance.squareform"], "raises": [], "example": [">>> df, distance_matrix = f_406([[1,2,3,4,5], [6,7,8,9,10]])", ">>> print(df)", "A B C D E", "0 1 2 3 4 5", "1 6 7 8 9 10", ">>> print(distance_matrix)", "0 1", "0 0.00000 11.18034", "1 11.18034 0.00000"]}} +{"task_id": "f_888", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\n\n# Constants\nROOMS = [\"Room1\", \"Room2\", \"Room3\", \"Room4\", \"Room5\"]\n\ndef f_888(date_str, booking_data):\n \"\"\"\n This function generates a status report of room bookings for a specified date\n and displays a bar plot representing the booking statuses of various rooms.\n It validates the provided date, compiles a booking status report, and visualizes\n the data in a bar plot.\n\n Parameters:\n - date_str (str): The date for which the booking status needs to be checked,\n in \"yyyy-mm-dd\" format. The function validates this date.\n - booking_data (dict): A dictionary with room names as keys and booking statuses\n as values. The keys should match the rooms listed in the ROOMS constant.\n\n Returns:\n - DataFrame: A pandas DataFrame containing booking status for each room.\n - plt.Axes: A matplotlib Axes object for the bar plot of booking statuses.\n\n Raises:\n - ValueError: Raised in two scenarios:\n 1. If `date_str` does not follow the \"yyyy-mm-dd\" format or is not a valid date.\n 2. 
If `date_str` refers to a past date.\n\n Requirements:\n - pandas\n - datetime\n\n Example:\n >>> from datetime import datetime\n >>> future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n >>> booking_info = {\"Room1\": \"Booked\", \"Room2\": \"Available\"}\n >>> report_df, ax = f_888(future_date, booking_info)\n >>> print(report_df)\n Room Booking Status\n 0 Room1 Booked\n 1 Room2 Available\n 2 Room3 Not Listed\n 3 Room4 Not Listed\n 4 Room5 Not Listed\n \"\"\"", "canonical_solution": " # Validate the date string\n try:\n date = datetime.strptime(date_str, \"%Y-%m-%d\")\n if date < datetime.now():\n raise ValueError(\"Date is in the past. Please provide a future date.\")\n except ValueError as e:\n raise ValueError(f\"Invalid date: {e}\") from e\n\n report_data = [[room, booking_data.get(room, \"Not Listed\")] for room in ROOMS]\n report_df = pd.DataFrame(report_data, columns=[\"Room\", \"Booking Status\"])\n\n # Create a bar plot of the booking statuses\n ax = (\n report_df[\"Booking Status\"]\n .value_counts()\n .plot(kind=\"bar\", title=\"Booking Statuses for \" + date_str)\n )\n\n return report_df, ax", "test": "import unittest\nimport pandas as pd\nfrom datetime import datetime, timedelta\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_888\"\"\"\n def test_future_date_valid_booking_data(self):\n \"\"\"\n Test f_888 with a future date and valid booking data.\n \"\"\"\n future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n booking_data = {\"Room1\": \"Booked\", \"Room2\": \"Available\"}\n report_df, _ = f_888(future_date, booking_data)\n self.assertIn(\"Room1\", report_df[\"Room\"].values)\n self.assertIn(\"Booked\", report_df[\"Booking Status\"].values)\n def test_past_date(self):\n \"\"\"\n Test f_888 with a past date to ensure it raises a ValueError.\n \"\"\"\n past_date = \"2020-01-01\"\n booking_data = {\"Room1\": \"Booked\"}\n with self.assertRaises(ValueError):\n 
f_888(past_date, booking_data)\n def test_invalid_date_format(self):\n \"\"\"\n Test f_888 with an invalid date format to check for ValueError.\n \"\"\"\n invalid_date = \"15-06-2023\"\n booking_data = {\"Room1\": \"Booked\"}\n with self.assertRaises(ValueError):\n f_888(invalid_date, booking_data)\n def test_booking_data_for_nonexistent_room(self):\n \"\"\"\n Test f_888 with booking data for a room not in the ROOMS constant.\n \"\"\"\n future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n booking_data = {\"Room6\": \"Booked\"}\n report_df, _ = f_888(future_date, booking_data)\n self.assertIn(\"Not Listed\", report_df[\"Booking Status\"].values)\n def test_no_booking_data(self):\n \"\"\"\n Test f_888 with no booking data provided.\n \"\"\"\n future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n booking_data = {}\n report_df, _ = f_888(future_date, booking_data)\n self.assertTrue((report_df[\"Booking Status\"] == \"Not Listed\").all())\n def tearDown(self):\n plt.clf()", "apis": ["pandas.DataFrame", "datetime.datetime.now", "datetime.datetime.strptime"], "libs": ["pandas", "datetime"], "doc": {"description": ["This function generates a status report of room bookings for a specified date", "and displays a bar plot representing the booking statuses of various rooms.", "It validates the provided date, compiles a booking status report, and visualizes", "the data in a bar plot."], "note": [], "params": ["date_str (str): The date for which the booking status needs to be checked,", "in \"yyyy-mm-dd\" format. The function validates this date.", "booking_data (dict): A dictionary with room names as keys and booking statuses", "as values. 
The keys should match the rooms listed in the ROOMS constant."], "returns": ["DataFrame: A pandas DataFrame containing booking status for each room.", "plt.Axes: A matplotlib Axes object for the bar plot of booking statuses."], "reqs": ["pandas", "datetime"], "raises": ["ValueError: Raised in two scenarios:", "1. If `date_str` does not follow the \"yyyy-mm-dd\" format or is not a valid date.", "2. If `date_str` refers to a past date."], "example": [">>> from datetime import datetime", ">>> future_date = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")", ">>> booking_info = {\"Room1\": \"Booked\", \"Room2\": \"Available\"}", ">>> report_df, ax = f_888(future_date, booking_info)", ">>> print(report_df)", "Room Booking Status", "0 Room1 Booked", "1 Room2 Available", "2 Room3 Not Listed", "3 Room4 Not Listed", "4 Room5 Not Listed"]}} +{"task_id": "f_892", "prompt": "from datetime import datetime\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_892(date_str):\n \"\"\"\n Plot a sine wave whose frequency is determined by the day of the month from the given date.\n\n Parameters:\n date_str (str): A date in \"yyyy-mm-dd\" format, used to determine the frequency of the sine wave.\n\n Returns:\n matplotlib.axes.Axes: An Axes object containing the plotted sine wave.\n\n Requirements:\n - datetime.datetime\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_892('2023-06-15')\n >>> print(ax.get_title())\n Sine Wave for 2023-06-15 (Frequency: 15)\n \"\"\"", "canonical_solution": " date = datetime.strptime(date_str, \"%Y-%m-%d\")\n x = np.linspace(0, 2 * np.pi, 1000)\n frequency = date.day\n y = np.sin(frequency * x)\n _, ax = plt.subplots()\n ax.plot(x, y)\n ax.set_title(f\"Sine Wave for {date_str} (Frequency: {frequency})\")\n return ax", "test": "import unittest\nimport matplotlib\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_892.\"\"\"\n def test_valid_date(self):\n \"\"\"\n Test with a valid date string to ensure 
the function returns a matplotlib Axes object.\n \"\"\"\n result = f_892(\"2023-06-15\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_leap_year_date(self):\n \"\"\"\n Test with a date from a leap year to check the function's handling of leap years.\n \"\"\"\n result = f_892(\"2024-02-29\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_beginning_of_month(self):\n \"\"\"\n Test with a date at the beginning of the month (low-frequency wave).\n \"\"\"\n result = f_892(\"2023-01-01\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_end_of_month(self):\n \"\"\"\n Test with a date towards the end of the month (high-frequency wave).\n \"\"\"\n result = f_892(\"2023-01-31\")\n self.assertIsInstance(result, matplotlib.axes.Axes)\n def test_invalid_date_format(self):\n \"\"\"\n Test with an invalid date format to check if the function raises a ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_892(\"15-06-2023\")\n def tearDown(self):\n plt.close()", "apis": ["numpy.sin", "numpy.pi", "numpy.linspace", "datetime.datetime.strptime", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "datetime"], "doc": {"description": ["Plot a sine wave whose frequency is determined by the day of the month from the given date."], "note": [], "params": ["date_str (str): A date in \"yyyy-mm-dd\" format, used to determine the frequency of the sine wave."], "returns": ["matplotlib.axes.Axes: An Axes object containing the plotted sine wave."], "reqs": ["datetime.datetime", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_892('2023-06-15')", ">>> print(ax.get_title())", "Sine Wave for 2023-06-15 (Frequency: 15)"]}} {"task_id": "f_763", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\n\ndef f_763(df, columns):\n \"\"\"\n Normalizes specified columns of a DataFrame using min-max scaling.\n\n Parameters:\n df (pandas.DataFrame): The DataFrame containing numerical data.\n 
columns (list of str): A list of column names to be normalized.\n\n Returns:\n pandas.DataFrame: A new DataFrame with the specified columns normalized between 0 and 1.\n\n Requirements:\n - pandas for DataFrame operations\n - sklearn.preprocessing for MinMaxScaler\n\n Constants:\n - A MinMaxScaler object from sklearn.preprocessing is used internally for scaling.\n\n Example:\n >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n >>> normalized_df = f_763(df, ['a', 'b'])\n >>> print(normalized_df)\n a b\n 0 0.0 0.0\n 1 0.5 0.5\n 2 1.0 1.0\n \"\"\"", "canonical_solution": " # Create a local MinMaxScaler object\n scaler = MinMaxScaler()\n \n # Create a copy of the DataFrame to avoid modifying the original DataFrame\n df_copy = df.copy()\n\n # Normalize the specified columns\n df_copy[columns] = scaler.fit_transform(df_copy[columns])\n\n return df_copy", "test": "import unittest\nimport pandas as pd\nfrom pandas.testing import assert_frame_equal\nfrom sklearn.preprocessing import MinMaxScaler\nimport sys\n# Import the function f_763 from the refined_function.py file\nsys.path.append('/mnt/data/')\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Input: DataFrame with two columns 'a' and 'b' with integer values\n # Output: DataFrame with 'a' and 'b' normalized\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n expected_df = pd.DataFrame({'a': [0.0, 0.5, 1.0], 'b': [0.0, 0.5, 1.0]})\n result_df = f_763(df, ['a', 'b'])\n assert_frame_equal(expected_df, result_df)\n def test_case_2(self):\n # Input: DataFrame with one column 'x' with float values\n # Output: DataFrame with 'x' normalized\n df = pd.DataFrame({'x': [1.1, 2.2, 3.3]})\n expected_df = pd.DataFrame({'x': [0.0, 0.5, 1.0]})\n result_df = f_763(df, ['x'])\n assert_frame_equal(expected_df, result_df)\n def test_case_3(self):\n # Input: DataFrame with multiple columns, but only one column 'y' to normalize\n # Output: DataFrame with 'y' normalized, other columns unchanged\n df = 
pd.DataFrame({'y': [10, 20, 30], 'z': [1, 2, 3]})\n expected_df = pd.DataFrame({'y': [0.0, 0.5, 1.0], 'z': [1, 2, 3]})\n result_df = f_763(df, ['y'])\n assert_frame_equal(expected_df, result_df)\n def test_case_4(self):\n # Input: DataFrame with negative numbers in column 'm'\n # Output: DataFrame with 'm' normalized\n df = pd.DataFrame({'m': [-1, 0, 1]})\n expected_df = pd.DataFrame({'m': [0.0, 0.5, 1.0]})\n result_df = f_763(df, ['m'])\n assert_frame_equal(expected_df, result_df)\n def test_case_5(self):\n # Input: DataFrame with all zeros in column 'n'\n # Output: DataFrame with 'n' normalized (all zeros)\n df = pd.DataFrame({'n': [0, 0, 0]})\n expected_df = pd.DataFrame({'n': [0.0, 0.0, 0.0]})\n result_df = f_763(df, ['n'])\n assert_frame_equal(expected_df, result_df)", "apis": ["sklearn.preprocessing.MinMaxScaler"], "libs": ["sklearn"], "doc": {"description": ["Normalizes specified columns of a DataFrame using min-max scaling.", "Constants:", "- A MinMaxScaler object from sklearn.preprocessing is used internally for scaling."], "note": [], "params": ["df (pandas.DataFrame): The DataFrame containing numerical data.", "columns (list of str): A list of column names to be normalized."], "returns": ["pandas.DataFrame: A new DataFrame with the specified columns normalized between 0 and 1."], "reqs": ["pandas for DataFrame operations", "sklearn.preprocessing for MinMaxScaler"], "raises": [], "example": [">>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})", ">>> normalized_df = f_763(df, ['a', 'b'])", ">>> print(normalized_df)", "a b", "0 0.0 0.0", "1 0.5 0.5", "2 1.0 1.0"]}} -{"task_id": "f_362", "prompt": "import subprocess\nimport pandas as pd\n\ndef f_362(script_path, output_file_path):\n \"\"\"\n Executes a script to produce a CSV, reads the CSV, and plots a bar graph from the data.\n\n This function runs the provided script, which should generate a CSV file at the specified output path.\n The CSV must have exactly two columns. 
It then reads this CSV into a DataFrame and plots a bar graph,\n setting the first column as the x-axis labels and the second column as the bar heights.\n It will raise ValueError if the script fails to execute, or if the produced CSV is not valid.\n\n Parameters:\n - script_path (str): Path to the script to be executed.\n - output_file_path (str): Path where the script outputs the CSV.\n\n Returns:\n - df (pd.DataFrame): DataFrame containing the data from the CSV.\n - ax (matplotlib.axes._axes.Axes): Axes object of the plotted bar graph.\n\n Requirements:\n - pandas\n - subprocess\n\n Examples:\n >>> df, ax = f_362(\"generate_data.sh\", \"data.csv\")\n >>> type(df)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " try:\n subprocess.run([script_path], check=True)\n except (subprocess.CalledProcessError, FileNotFoundError):\n raise ValueError(\n \"Error occurred while executing the script or script not found\"\n )\n\n df = pd.read_csv(output_file_path)\n\n if len(df.columns) != 2:\n raise ValueError(\"CSV file must contain exactly 2 columns\")\n\n ax = df.plot(kind=\"bar\", x=df.columns[0], legend=False)\n ax.set_xlabel(df.columns[0])\n\n return df, ax", "test": "import unittest\nimport os\nimport tempfile\n# import matplotlib\n# Force matplotlib to not use any Xwindows backend.\n# matplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.script_path = os.path.join(self.temp_dir.name, \"script.sh\")\n self.output_path = os.path.join(self.temp_dir.name, \"output.csv\")\n self.valid_csv_content = [\n f'echo \"Name,Value\" > {self.output_path}\\n',\n f'echo \"A,1\" >> {self.output_path}\\n',\n f'echo \"B,2\" >> {self.output_path}\\n',\n f'echo \"C,3\" >> {self.output_path}\\n',\n ]\n def tearDown(self):\n self.temp_dir.cleanup()\n plt.close(\"all\")\n def _create_script(self, lines):\n with open(self.script_path, \"w\") as file:\n 
file.write(\"#!/bin/bash\\n\")\n file.writelines(lines)\n os.chmod(self.script_path, 0o755)\n def _validate_y_tick_labels(self, ax, df):\n plt.gcf().canvas.draw() # In older versions, need to force matplotlib to render\n y_tick_labels = [\n float(label.get_text())\n for label in ax.get_yticklabels()\n if label.get_text()\n ]\n self.assertTrue(\n all(\n y_tick_labels[i] <= y_tick_labels[i + 1]\n for i in range(len(y_tick_labels) - 1)\n ),\n \"Y-tick labels are not in increasing order\",\n )\n self.assertTrue(\n min(y_tick_labels) <= df[df.columns[1]].min() <= max(y_tick_labels)\n and min(y_tick_labels) <= df[df.columns[1]].max() <= max(y_tick_labels),\n \"Y-tick labels do not cover the range of the data\",\n )\n def test_case_1(self):\n # Test plot generation\n self._create_script(self.valid_csv_content)\n df, ax = f_362(self.script_path, self.output_path)\n expected_labels = df.iloc[:, 0].tolist()\n x_tick_labels = [tick.get_text() for tick in ax.get_xticklabels()]\n # Expected return object type\n self.assertIsInstance(ax, plt.Axes)\n # Expected number of bars\n self.assertEqual(len(ax.patches), df.shape[0])\n # x-tick labels match the first column of the DataFrame\n self.assertListEqual(x_tick_labels, expected_labels)\n self._validate_y_tick_labels(ax, df)\n def test_case_2(self):\n # Test basic csv\n expected_columns = [\"Name\", \"Value\"]\n expected_data = {\"Name\": [\"A\", \"B\", \"C\"], \"Value\": [1, 2, 3]}\n self._create_script(self.valid_csv_content)\n df, ax = f_362(self.script_path, self.output_path)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape, (3, 2))\n self._validate_y_tick_labels(ax, df)\n self.assertListEqual(df.columns.tolist(), expected_columns)\n for column, expected_values in expected_data.items():\n self.assertTrue(all(df[column] == expected_values))\n def test_case_3(self):\n # Test handling of script execution failure\n self._create_script([\"exit 1\\n\"])\n with self.assertRaises(ValueError):\n 
f_362(self.script_path, self.output_path)\n def test_case_4(self):\n # Test handling of files with too many columns\n content = [\n f'echo \"Name,Value,Extra\" > {self.output_path}\\n',\n f'echo \"A,1,Ignore\" >> {self.output_path}\\n',\n f'echo \"B,2,Ignore\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n with self.assertRaises(ValueError):\n f_362(self.script_path, self.output_path)\n def test_case_5(self):\n # Test handling of files with too few columns\n content = [\n f'echo \"Name\" > {self.output_path}\\n',\n f'echo \"A\" >> {self.output_path}\\n',\n f'echo \"B\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n with self.assertRaises(ValueError):\n f_362(self.script_path, self.output_path)\n def test_case_6(self):\n # Test handling of empty file\n content = [f\"> {self.output_path}\\n\"]\n self._create_script(content)\n with self.assertRaises(ValueError):\n f_362(self.script_path, self.output_path)\n def test_case_7(self):\n # Test handling non-numeric values\n content = [\n f'echo \"Name,Value\" > {self.output_path}\\n',\n f'echo \"A,NonNumeric\" >> {self.output_path}\\n',\n f'echo \"B,2\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n with self.assertRaises(TypeError):\n f_362(self.script_path, self.output_path)\n def test_case_8(self):\n # Test handling missing values\n content = [\n f'echo \"Name,Value\" > {self.output_path}\\n',\n f'echo \"A,\" >> {self.output_path}\\n',\n f'echo \"B,2\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n df, _ = f_362(self.script_path, self.output_path)\n self.assertTrue(df.isnull().values.any())\n self.assertEqual(df.shape, (2, 2))\n def test_case_9(self):\n # Handle handling of non-exitent script\n with self.assertRaises(ValueError):\n f_362(\n os.path.join(self.temp_dir.name, \"invalid_script_nonexist.sh\"),\n self.output_path,\n )", "apis": ["subprocess.CalledProcessError", "pandas.read_csv", "subprocess.run"], "libs": ["subprocess", "pandas"], "doc": 
{"description": ["Executes a script to produce a CSV, reads the CSV, and plots a bar graph from the data.", "This function runs the provided script, which should generate a CSV file at the specified output path.", "The CSV must have exactly two columns. It then reads this CSV into a DataFrame and plots a bar graph,", "setting the first column as the x-axis labels and the second column as the bar heights.", "It will raise ValueError if the script fails to execute, or if the produced CSV is not valid."], "note": [], "params": ["script_path (str): Path to the script to be executed.", "output_file_path (str): Path where the script outputs the CSV."], "returns": ["df (pd.DataFrame): DataFrame containing the data from the CSV.", "ax (matplotlib.axes._axes.Axes): Axes object of the plotted bar graph."], "reqs": ["pandas", "subprocess"], "raises": [], "example": ["Examples:", ">>> df, ax = f_362(\"generate_data.sh\", \"data.csv\")", ">>> type(df)", "", ">>> type(ax)", ""]}} +{"task_id": "f_362", "prompt": "import subprocess\nimport pandas as pd\n\ndef f_362(script_path, output_file_path):\n \"\"\"\n Executes a script to produce a CSV, reads the CSV, and plots a bar graph from the data.\n\n This function runs the provided script, which should generate a CSV file at the specified output path.\n The CSV must have exactly two columns. 
It then reads this CSV into a DataFrame and plots a bar graph,\n setting the first column as the x-axis labels and the second column as the bar heights.\n It will raise ValueError if the script fails to execute, or if the produced CSV is not valid.\n\n Parameters:\n - script_path (str): Path to the script to be executed.\n - output_file_path (str): Path where the script outputs the CSV.\n\n Returns:\n - df (pd.DataFrame): DataFrame containing the data from the CSV.\n - ax (matplotlib.axes._axes.Axes): Axes object of the plotted bar graph.\n\n Requirements:\n - pandas\n - subprocess\n\n Examples:\n >>> df, ax = f_362(\"generate_data.sh\", \"data.csv\")\n >>> type(df)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " try:\n subprocess.run([script_path], check=True)\n except (subprocess.CalledProcessError, FileNotFoundError):\n raise ValueError(\n \"Error occurred while executing the script or script not found\"\n )\n\n df = pd.read_csv(output_file_path)\n\n if len(df.columns) != 2:\n raise ValueError(\"CSV file must contain exactly 2 columns\")\n\n ax = df.plot(kind=\"bar\", x=df.columns[0], legend=False)\n ax.set_xlabel(df.columns[0])\n\n return df, ax", "test": "import unittest\nimport os\nimport tempfile\n# import matplotlib\n# Force matplotlib to not use any Xwindows backend.\n# matplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.script_path = os.path.join(self.temp_dir.name, \"script.sh\")\n self.output_path = os.path.join(self.temp_dir.name, \"output.csv\")\n self.valid_csv_content = [\n f'echo \"Name,Value\" > {self.output_path}\\n',\n f'echo \"A,1\" >> {self.output_path}\\n',\n f'echo \"B,2\" >> {self.output_path}\\n',\n f'echo \"C,3\" >> {self.output_path}\\n',\n ]\n def tearDown(self):\n self.temp_dir.cleanup()\n plt.close(\"all\")\n def _create_script(self, lines):\n with open(self.script_path, \"w\") as file:\n 
file.write(\"#!/bin/bash\\n\")\n file.writelines(lines)\n os.chmod(self.script_path, 0o755)\n def _validate_y_tick_labels(self, ax, df):\n plt.gcf().canvas.draw() # In older versions, need to force matplotlib to render\n y_tick_labels = [\n float(label.get_text())\n for label in ax.get_yticklabels()\n if label.get_text()\n ]\n self.assertTrue(\n all(\n y_tick_labels[i] <= y_tick_labels[i + 1]\n for i in range(len(y_tick_labels) - 1)\n ),\n \"Y-tick labels are not in increasing order\",\n )\n self.assertTrue(\n min(y_tick_labels) <= df[df.columns[1]].min() <= max(y_tick_labels)\n and min(y_tick_labels) <= df[df.columns[1]].max() <= max(y_tick_labels),\n \"Y-tick labels do not cover the range of the data\",\n )\n def test_case_1(self):\n # Test plot generation\n self._create_script(self.valid_csv_content)\n df, ax = f_362(self.script_path, self.output_path)\n expected_labels = df.iloc[:, 0].tolist()\n x_tick_labels = [tick.get_text() for tick in ax.get_xticklabels()]\n # Expected return object type\n self.assertIsInstance(ax, plt.Axes)\n # Expected number of bars\n self.assertEqual(len(ax.patches), df.shape[0])\n # x-tick labels match the first column of the DataFrame\n self.assertListEqual(x_tick_labels, expected_labels)\n self._validate_y_tick_labels(ax, df)\n def test_case_2(self):\n # Test basic csv\n expected_columns = [\"Name\", \"Value\"]\n expected_data = {\"Name\": [\"A\", \"B\", \"C\"], \"Value\": [1, 2, 3]}\n self._create_script(self.valid_csv_content)\n df, ax = f_362(self.script_path, self.output_path)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape, (3, 2))\n self._validate_y_tick_labels(ax, df)\n self.assertListEqual(df.columns.tolist(), expected_columns)\n for column, expected_values in expected_data.items():\n self.assertTrue(all(df[column] == expected_values))\n def test_case_3(self):\n # Test handling of script execution failure\n self._create_script([\"exit 1\\n\"])\n with self.assertRaises(ValueError):\n 
f_362(self.script_path, self.output_path)\n def test_case_4(self):\n # Test handling of files with too many columns\n content = [\n f'echo \"Name,Value,Extra\" > {self.output_path}\\n',\n f'echo \"A,1,Ignore\" >> {self.output_path}\\n',\n f'echo \"B,2,Ignore\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n with self.assertRaises(ValueError):\n f_362(self.script_path, self.output_path)\n def test_case_5(self):\n # Test handling of files with too few columns\n content = [\n f'echo \"Name\" > {self.output_path}\\n',\n f'echo \"A\" >> {self.output_path}\\n',\n f'echo \"B\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n with self.assertRaises(ValueError):\n f_362(self.script_path, self.output_path)\n def test_case_6(self):\n # Test handling of empty file\n content = [f\"> {self.output_path}\\n\"]\n self._create_script(content)\n with self.assertRaises(ValueError):\n f_362(self.script_path, self.output_path)\n def test_case_7(self):\n # Test handling non-numeric values\n content = [\n f'echo \"Name,Value\" > {self.output_path}\\n',\n f'echo \"A,NonNumeric\" >> {self.output_path}\\n',\n f'echo \"B,2\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n with self.assertRaises(TypeError):\n f_362(self.script_path, self.output_path)\n def test_case_8(self):\n # Test handling missing values\n content = [\n f'echo \"Name,Value\" > {self.output_path}\\n',\n f'echo \"A,\" >> {self.output_path}\\n',\n f'echo \"B,2\" >> {self.output_path}\\n',\n ]\n self._create_script(content)\n df, _ = f_362(self.script_path, self.output_path)\n self.assertTrue(df.isnull().values.any())\n self.assertEqual(df.shape, (2, 2))\n def test_case_9(self):\n # Handle handling of non-exitent script\n with self.assertRaises(ValueError):\n f_362(\n os.path.join(self.temp_dir.name, \"invalid_script_nonexist.sh\"),\n self.output_path,\n )", "apis": ["pandas.read_csv", "subprocess.CalledProcessError", "subprocess.run"], "libs": ["subprocess", "pandas"], "doc": 
{"description": ["Executes a script to produce a CSV, reads the CSV, and plots a bar graph from the data.", "This function runs the provided script, which should generate a CSV file at the specified output path.", "The CSV must have exactly two columns. It then reads this CSV into a DataFrame and plots a bar graph,", "setting the first column as the x-axis labels and the second column as the bar heights.", "It will raise ValueError if the script fails to execute, or if the produced CSV is not valid."], "note": [], "params": ["script_path (str): Path to the script to be executed.", "output_file_path (str): Path where the script outputs the CSV."], "returns": ["df (pd.DataFrame): DataFrame containing the data from the CSV.", "ax (matplotlib.axes._axes.Axes): Axes object of the plotted bar graph."], "reqs": ["pandas", "subprocess"], "raises": [], "example": ["Examples:", ">>> df, ax = f_362(\"generate_data.sh\", \"data.csv\")", ">>> type(df)", "", ">>> type(ax)", ""]}} {"task_id": "f_817", "prompt": "import shutil\nimport pathlib\n\n\ndef f_817(source_path, destination_path):\n \"\"\"\n Lists files in the specified source directory without descending into subdirectories and copies them to a\n destination directory.\n\n Parameters:\n - source_path (str): The source directory path to analyze. 
Must be an existing, accessible directory.\n - destination_path (str): The destination directory path where files will be copied.\n If it does not exist, this function will create it.\n\n Returns:\n Tuple[str, List[str]]: A tuple containing the name of the source directory and a list of filenames (not\n full paths) that were copied.\n\n Raises:\n - ValueError: If source_path does not exist or is not a directory.\n\n Requirements:\n - shutil\n - pathlib\n\n Example:\n >>> x = f_817('/Docs/src/Scripts')\n >>> type(x)\n \n >>> x\n ('Scripts', ['file_1_in_scripts_dir.txt', 'file_2_in_scripts_dir.txt'])\n \"\"\"", "canonical_solution": " source_path = pathlib.Path(source_path).resolve()\n destination_path = pathlib.Path(destination_path).resolve()\n\n if not (source_path.exists() and source_path.is_dir()):\n raise ValueError(\"source_path must be an existing directory.\")\n\n destination_path.mkdir(parents=True, exist_ok=True)\n\n results = []\n for entry in source_path.iterdir():\n if entry.is_file():\n results.append(str(entry.name))\n shutil.copy(str(entry), str(destination_path))\n return (source_path.name, results)", "test": "import unittest\nimport tempfile\nimport pathlib\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.test_source_dir = pathlib.Path(self.temp_dir.name) / \"testf817-source\"\n self.test_target_dir = pathlib.Path(self.temp_dir.name) / \"testf817-target\"\n self.test_source_dir.mkdir(parents=True, exist_ok=True)\n self.test_target_dir.mkdir(parents=True, exist_ok=True)\n def tearDown(self):\n self.temp_dir.cleanup()\n def create_files(self, paths):\n for path in paths:\n full_path = self.test_source_dir / path\n full_path.parent.mkdir(parents=True, exist_ok=True)\n full_path.touch()\n def test_case_1(self):\n # Test empty directory\n target_dir_before = list(self.test_target_dir.iterdir())\n result = f_817(str(self.test_source_dir), str(self.test_target_dir))\n target_dir_after = 
list(self.test_target_dir.iterdir())\n self.assertEqual(result, (\"testf817-source\", []))\n self.assertEqual(target_dir_before, target_dir_after)\n def test_case_2(self):\n # Test directory with one file\n self.create_files([\"file1.txt\"])\n result = f_817(str(self.test_source_dir), str(self.test_target_dir))\n self.assertEqual(result, (\"testf817-source\", [\"file1.txt\"]))\n # Check if files are copied correctly\n self.assertEqual(\n list(self.test_target_dir.iterdir()), [self.test_target_dir / \"file1.txt\"]\n )\n def test_case_3(self):\n # Test directory with multiple files\n self.create_files([\"file1.txt\", \"file2.txt\", \"file3.txt\"])\n result = f_817(str(self.test_source_dir), str(self.test_target_dir))\n self.assertEqual(len(result), 2)\n self.assertEqual(result[0], \"testf817-source\")\n self.assertEqual(\n sorted(result[1]), sorted([\"file1.txt\", \"file2.txt\", \"file3.txt\"])\n )\n self.assertEqual(\n sorted(self.test_target_dir.iterdir()),\n sorted(\n [\n self.test_target_dir / \"file1.txt\",\n self.test_target_dir / \"file2.txt\",\n self.test_target_dir / \"file3.txt\",\n ]\n ),\n )\n def test_case_4(self):\n # Test directory with subdirectories\n self.test_source_dir.joinpath(\"subdir1\").mkdir()\n self.create_files([\"file1.txt\", \"file2.txt\"])\n self.create_files([\"subdir1/file3.txt\"]) # File inside subdirectory\n result = f_817(str(self.test_source_dir), str(self.test_target_dir))\n self.assertEqual(len(result), 2)\n self.assertEqual(result[0], \"testf817-source\")\n self.assertEqual(sorted(result[1]), sorted([\"file1.txt\", \"file2.txt\"]))\n # Check if files in subdirectories are ignored and only files in the source directory are copied\n self.assertEqual(\n sorted(self.test_target_dir.iterdir()),\n sorted(\n [self.test_target_dir / \"file1.txt\", self.test_target_dir / \"file2.txt\"]\n ),\n )\n def test_case_5(self):\n # Test non-existent source directory\n with self.assertRaises(ValueError):\n f_817(str(self.test_source_dir / 
\"nonexistent\"), str(self.test_target_dir))\n def test_case_6(self):\n # Test non-existent destination directory\n shutil.rmtree(self.test_target_dir)\n result = f_817(str(self.test_source_dir), str(self.test_target_dir))\n self.assertEqual(result, (\"testf817-source\", []))\n # Check if destination directory is created\n self.assertTrue(self.test_target_dir.exists())\n def test_case_7(self):\n # Test copying files to existing destination directory\n self.create_files([\"file1.txt\", \"file2.txt\"])\n result = f_817(str(self.test_source_dir), str(self.test_target_dir))\n self.assertEqual(sorted(result[1]), sorted([\"file1.txt\", \"file2.txt\"]))\n # Call the function again\n self.create_files([\"file3.txt\", \"file4.txt\"])\n result = f_817(str(self.test_source_dir), str(self.test_target_dir))\n # There should now be 4 files in the directory\n self.assertEqual(\n sorted(self.test_source_dir.iterdir()),\n sorted(\n [\n self.test_source_dir / \"file1.txt\",\n self.test_source_dir / \"file2.txt\",\n self.test_source_dir / \"file3.txt\",\n self.test_source_dir / \"file4.txt\",\n ]\n ),\n )\n # which means 4 files should have been copied\n self.assertEqual(\n sorted(result[1]),\n sorted([\"file1.txt\", \"file2.txt\", \"file3.txt\", \"file4.txt\"]),\n )\n # and 4 files should be in the destination\n self.assertEqual(\n sorted(self.test_target_dir.iterdir()),\n sorted(\n [\n self.test_target_dir / \"file1.txt\",\n self.test_target_dir / \"file2.txt\",\n self.test_target_dir / \"file3.txt\",\n self.test_target_dir / \"file4.txt\",\n ]\n ),\n )", "apis": ["shutil.copy", "pathlib.Path"], "libs": ["shutil", "pathlib"], "doc": {"description": ["Lists files in the specified source directory without descending into subdirectories and copies them to a", "destination directory."], "note": [], "params": ["source_path (str): The source directory path to analyze. 
Must be an existing, accessible directory.", "destination_path (str): The destination directory path where files will be copied.", "If it does not exist, this function will create it."], "returns": ["Tuple[str, List[str]]: A tuple containing the name of the source directory and a list of filenames (not", "full paths) that were copied."], "reqs": ["shutil", "pathlib"], "raises": ["ValueError: If source_path does not exist or is not a directory."], "example": [">>> x = f_817('/Docs/src/Scripts')", ">>> type(x)", "", ">>> x", "('Scripts', ['file_1_in_scripts_dir.txt', 'file_2_in_scripts_dir.txt'])"]}} -{"task_id": "f_384", "prompt": "from datetime import datetime\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_384(start_time, end_time, step, trend, seed=42):\n \"\"\"\n Generate a time series from a given epoch start time to end time with a specified step and trend.\n The time series is plotted with timestamps on the x-axis ('Time') and values on the y-axis ('Value').\n The values are generated from a normal distribution, and a linear trend is added based on the\n provided trend value.\n\n Parameters:\n - start_time (int): The start epoch time in milliseconds.\n - end_time (int): The end epoch time in milliseconds. Must be greater than start_time.\n - step (int): The step in milliseconds between each data point. Must be agreater than 0.\n - trend (float): The trend value to be added to the time series. It acts as a multiplier\n for the index, adding a linear trend to the randomly generated values.\n - seed (int, optional): Seed for reproducibility. 
Default is 42.\n\n Returns:\n - ax (plt.Axes): The Axes object of the generated plot.\n\n Requirements:\n - datetime.datetime\n - pandas\n - numpy\n\n Example:\n >>> ax = f_384(0, 10000, 100, 0.001)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]\n \"\"\"", "canonical_solution": " if (start_time - end_time) > 0:\n raise ValueError(\"Start time must be before end time\")\n if step <= 0:\n raise ValueError(\"Invalid step value.\")\n np.random.seed(seed)\n\n timestamps = np.arange(start_time, end_time, step)\n df = pd.DataFrame(columns=[\"Time\", \"Value\"])\n values = np.random.normal(size=len(timestamps))\n\n for i, ts in enumerate(timestamps):\n dt = datetime.fromtimestamp(ts / 1000).strftime(\"%Y-%m-%d %H:%M:%S.%f\")\n value = values[i] + trend * i\n df.loc[i] = [dt, value]\n\n ax = df.plot(x=\"Time\", y=\"Value\")\n ax.set_ylabel(\"Value\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.default_start = 0\n self.default_end = 10000\n self.default_step = 100\n self.default_trend = 0.001\n self.default_seed = 42\n def test_case_1(self):\n ax = f_384(\n self.default_start, self.default_end, self.default_step, self.default_trend\n )\n self.assertIsInstance(ax, plt.Axes, \"Returned object is not an Axes instance.\")\n self.assertEqual(ax.get_xlabel(), \"Time\", \"X-axis label is incorrect.\")\n self.assertEqual(ax.get_ylabel(), \"Value\", \"Y-axis label is incorrect.\")\n def test_case_2(self):\n # Test with different seed for reproducibility\n ax1 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n 
seed=self.default_seed,\n )\n ax2 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n seed=self.default_seed,\n )\n self.assertTrue(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata()),\n \"Data is not reproducible with the same seed.\",\n )\n def test_case_3(self):\n # Test with different seeds to ensure different results\n ax1 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n seed=self.default_seed,\n )\n ax2 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n seed=self.default_seed + 10,\n )\n self.assertFalse(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata()),\n \"Data is the same with different seeds.\",\n )\n def test_case_4(self):\n # Test negative trend\n ax = f_384(self.default_start, self.default_end, self.default_step, -0.001)\n self.assertIsInstance(ax, plt.Axes)\n def test_case_5(self):\n # Test no trend\n ax = f_384(self.default_start, self.default_end, self.default_step, 0.0)\n self.assertIsInstance(ax, plt.Axes)\n def test_case_6(self):\n # Test when start time is greater than end time\n with self.assertRaises(Exception):\n f_384(10000, 0, self.default_step, self.default_trend)\n def test_case_7(self):\n # Function should fail when step is 0\n with self.assertRaises(Exception):\n f_384(self.default_start, self.default_end, 0, self.default_trend)\n def test_case_8(self):\n # Test time formatting\n ax = f_384(0, 1000, 100, 0.001)\n # Manually check one of the labels for correct formatting\n self.assertTrue(\n any([\"1970\" in label.get_text() for label in ax.get_xticklabels()])\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "numpy.random", "numpy.random.seed", "datetime.datetime.fromtimestamp", "numpy.random.normal", "pandas.DataFrame"], "libs": ["numpy", "pandas", "datetime"], "doc": {"description": ["Generate a time series from a given epoch start time to 
end time with a specified step and trend.", "The time series is plotted with timestamps on the x-axis ('Time') and values on the y-axis ('Value').", "The values are generated from a normal distribution, and a linear trend is added based on the", "provided trend value."], "note": [], "params": ["start_time (int): The start epoch time in milliseconds.", "end_time (int): The end epoch time in milliseconds. Must be greater than start_time.", "step (int): The step in milliseconds between each data point. Must be agreater than 0.", "trend (float): The trend value to be added to the time series. It acts as a multiplier", "for the index, adding a linear trend to the randomly generated values.", "seed (int, optional): Seed for reproducibility. Default is 42."], "returns": ["ax (plt.Axes): The Axes object of the generated plot."], "reqs": ["datetime.datetime", "pandas", "numpy"], "raises": [], "example": [">>> ax = f_384(0, 10000, 100, 0.001)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]"]}} -{"task_id": "f_537", "prompt": "import pandas as pd\nimport random\n\ndef f_537(df):\n \"\"\"\n Generate a DataFrame that contains savegames for a number of games between different teams.\n Each row of the input DataFrame represents a match, and contains two teams and their respective scores.\n The function adds a 'winner' column to the DataFrame, which is the team with the highest score in each match.\n If the scores are equal, the winner is should be randomly decided.\n \n Parameters:\n - df (pandas.DataFrame): The input DataFrame with columns 'team1', 'team2', 'score1', 'score2'.\n\n Requirements:\n - pandas\n - random\n \n Returns:\n - df (pandas.DataFrame): The DataFrame 
with the added 'winner' column.\n \n Example:\n >>> import numpy as np\n >>> import pandas as pd\n >>> df = pd.DataFrame({'team1': np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),\n ... 'team2': np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),\n ... 'score1': np.random.randint(0, 10, 20),\n ... 'score2': np.random.randint(0, 10, 20)})\n >>> df = f_537(df)\n >>> assert 'winner' in df.columns\n >>> assert df['winner'].dtype == object\n >>> assert all(winner in ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'] for winner in df['winner'])\n \"\"\"", "canonical_solution": "\n def determine_winner(row):\n if row['score1'] > row['score2']:\n return row['team1']\n elif row['score1'] < row['score2']:\n return row['team2']\n else:\n return random.choice([row['team1'], row['team2']])\n \n # Using pd.Series to explicitly create a new Series for the 'winner' column\n winner_series = pd.Series([determine_winner(row) for index, row in df.iterrows()], index=df.index)\n df['winner'] = winner_series\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n random.seed(42)\n def test_case_1(self):\n df = pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [1, 2, 3, 4, 5],\n 'score2': [2, 3, 4, 5, 6]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team B', 'Team C', 'Team D', 'Team E', 'Team A'])))\n def test_case_2(self):\n df = pd.DataFrame({'team1': ['Team C', 'Team D', 'Team E', 'Team A', 'Team B'],\n 'team2': ['Team D', 'Team E', 'Team A', 'Team B', 'Team C'],\n 'score1': [99, 99, 99, 99, 99],\n 'score2': [99, 99, 99, 99, 99]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team C', 'Team D', 'Team A', 'Team A', 'Team B'])))\n def test_case_3(self):\n df = 
pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [0, 0, 0, 0, 0],\n 'score2': [0, 0, 0, 0, 0]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team A', 'Team B', 'Team D', 'Team D', 'Team E'])))\n \n def test_case_4(self):\n df = pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [10, 9, 8, 7, 6],\n 'score2': [9, 8, 7, 6, 5]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'])))\n \n def test_case_5(self):\n df = pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [10, 9, 8, 7, 6],\n 'score2': [11, 12, 13, 14, 15]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team B', 'Team C', 'Team D', 'Team E', 'Team A'])))", "apis": ["pandas.Series", "random.choice"], "libs": ["pandas", "random"], "doc": {"description": ["Generate a DataFrame that contains savegames for a number of games between different teams.", "Each row of the input DataFrame represents a match, and contains two teams and their respective scores.", "The function adds a 'winner' column to the DataFrame, which is the team with the highest score in each match.", "If the scores are equal, the winner is should be randomly decided."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame with columns 'team1', 'team2', 'score1', 'score2'."], "returns": ["df (pandas.DataFrame): The DataFrame with the added 'winner' column."], "reqs": ["pandas", "random"], "raises": [], "example": [">>> import numpy as np", ">>> import pandas as pd", ">>> df = pd.DataFrame({'team1': 
np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),", "... 'team2': np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),", "... 'score1': np.random.randint(0, 10, 20),", "... 'score2': np.random.randint(0, 10, 20)})", ">>> df = f_537(df)", ">>> assert 'winner' in df.columns", ">>> assert df['winner'].dtype == object", ">>> assert all(winner in ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'] for winner in df['winner'])"]}} +{"task_id": "f_384", "prompt": "from datetime import datetime\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_384(start_time, end_time, step, trend, seed=42):\n \"\"\"\n Generate a time series from a given epoch start time to end time with a specified step and trend.\n The time series is plotted with timestamps on the x-axis ('Time') and values on the y-axis ('Value').\n The values are generated from a normal distribution, and a linear trend is added based on the\n provided trend value.\n\n Parameters:\n - start_time (int): The start epoch time in milliseconds.\n - end_time (int): The end epoch time in milliseconds. Must be greater than start_time.\n - step (int): The step in milliseconds between each data point. Must be agreater than 0.\n - trend (float): The trend value to be added to the time series. It acts as a multiplier\n for the index, adding a linear trend to the randomly generated values.\n - seed (int, optional): Seed for reproducibility. 
Default is 42.\n\n Returns:\n - ax (plt.Axes): The Axes object of the generated plot.\n\n Requirements:\n - datetime.datetime\n - pandas\n - numpy\n\n Example:\n >>> ax = f_384(0, 10000, 100, 0.001)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]\n \"\"\"", "canonical_solution": " if (start_time - end_time) > 0:\n raise ValueError(\"Start time must be before end time\")\n if step <= 0:\n raise ValueError(\"Invalid step value.\")\n np.random.seed(seed)\n\n timestamps = np.arange(start_time, end_time, step)\n df = pd.DataFrame(columns=[\"Time\", \"Value\"])\n values = np.random.normal(size=len(timestamps))\n\n for i, ts in enumerate(timestamps):\n dt = datetime.fromtimestamp(ts / 1000).strftime(\"%Y-%m-%d %H:%M:%S.%f\")\n value = values[i] + trend * i\n df.loc[i] = [dt, value]\n\n ax = df.plot(x=\"Time\", y=\"Value\")\n ax.set_ylabel(\"Value\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.default_start = 0\n self.default_end = 10000\n self.default_step = 100\n self.default_trend = 0.001\n self.default_seed = 42\n def test_case_1(self):\n ax = f_384(\n self.default_start, self.default_end, self.default_step, self.default_trend\n )\n self.assertIsInstance(ax, plt.Axes, \"Returned object is not an Axes instance.\")\n self.assertEqual(ax.get_xlabel(), \"Time\", \"X-axis label is incorrect.\")\n self.assertEqual(ax.get_ylabel(), \"Value\", \"Y-axis label is incorrect.\")\n def test_case_2(self):\n # Test with different seed for reproducibility\n ax1 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n 
seed=self.default_seed,\n )\n ax2 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n seed=self.default_seed,\n )\n self.assertTrue(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata()),\n \"Data is not reproducible with the same seed.\",\n )\n def test_case_3(self):\n # Test with different seeds to ensure different results\n ax1 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n seed=self.default_seed,\n )\n ax2 = f_384(\n self.default_start,\n self.default_end,\n self.default_step,\n self.default_trend,\n seed=self.default_seed + 10,\n )\n self.assertFalse(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata()),\n \"Data is the same with different seeds.\",\n )\n def test_case_4(self):\n # Test negative trend\n ax = f_384(self.default_start, self.default_end, self.default_step, -0.001)\n self.assertIsInstance(ax, plt.Axes)\n def test_case_5(self):\n # Test no trend\n ax = f_384(self.default_start, self.default_end, self.default_step, 0.0)\n self.assertIsInstance(ax, plt.Axes)\n def test_case_6(self):\n # Test when start time is greater than end time\n with self.assertRaises(Exception):\n f_384(10000, 0, self.default_step, self.default_trend)\n def test_case_7(self):\n # Function should fail when step is 0\n with self.assertRaises(Exception):\n f_384(self.default_start, self.default_end, 0, self.default_trend)\n def test_case_8(self):\n # Test time formatting\n ax = f_384(0, 1000, 100, 0.001)\n # Manually check one of the labels for correct formatting\n self.assertTrue(\n any([\"1970\" in label.get_text() for label in ax.get_xticklabels()])\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "pandas.DataFrame", "numpy.random", "datetime.datetime.fromtimestamp", "numpy.random.normal", "numpy.random.seed"], "libs": ["pandas", "numpy", "datetime"], "doc": {"description": ["Generate a time series from a given epoch start time to 
end time with a specified step and trend.", "The time series is plotted with timestamps on the x-axis ('Time') and values on the y-axis ('Value').", "The values are generated from a normal distribution, and a linear trend is added based on the", "provided trend value."], "note": [], "params": ["start_time (int): The start epoch time in milliseconds.", "end_time (int): The end epoch time in milliseconds. Must be greater than start_time.", "step (int): The step in milliseconds between each data point. Must be agreater than 0.", "trend (float): The trend value to be added to the time series. It acts as a multiplier", "for the index, adding a linear trend to the randomly generated values.", "seed (int, optional): Seed for reproducibility. Default is 42."], "returns": ["ax (plt.Axes): The Axes object of the generated plot."], "reqs": ["datetime.datetime", "pandas", "numpy"], "raises": [], "example": [">>> ax = f_384(0, 10000, 100, 0.001)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]"]}} +{"task_id": "f_537", "prompt": "import pandas as pd\nimport random\n\ndef f_537(df):\n \"\"\"\n Generate a DataFrame that contains savegames for a number of games between different teams.\n Each row of the input DataFrame represents a match, and contains two teams and their respective scores.\n The function adds a 'winner' column to the DataFrame, which is the team with the highest score in each match.\n If the scores are equal, the winner is should be randomly decided.\n \n Parameters:\n - df (pandas.DataFrame): The input DataFrame with columns 'team1', 'team2', 'score1', 'score2'.\n\n Requirements:\n - pandas\n - random\n \n Returns:\n - df (pandas.DataFrame): The DataFrame 
with the added 'winner' column.\n \n Example:\n >>> import numpy as np\n >>> import pandas as pd\n >>> df = pd.DataFrame({'team1': np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),\n ... 'team2': np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),\n ... 'score1': np.random.randint(0, 10, 20),\n ... 'score2': np.random.randint(0, 10, 20)})\n >>> df = f_537(df)\n >>> assert 'winner' in df.columns\n >>> assert df['winner'].dtype == object\n >>> assert all(winner in ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'] for winner in df['winner'])\n \"\"\"", "canonical_solution": "\n def determine_winner(row):\n if row['score1'] > row['score2']:\n return row['team1']\n elif row['score1'] < row['score2']:\n return row['team2']\n else:\n return random.choice([row['team1'], row['team2']])\n \n # Using pd.Series to explicitly create a new Series for the 'winner' column\n winner_series = pd.Series([determine_winner(row) for index, row in df.iterrows()], index=df.index)\n df['winner'] = winner_series\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def setUp(self):\n random.seed(42)\n def test_case_1(self):\n df = pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [1, 2, 3, 4, 5],\n 'score2': [2, 3, 4, 5, 6]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team B', 'Team C', 'Team D', 'Team E', 'Team A'])))\n def test_case_2(self):\n df = pd.DataFrame({'team1': ['Team C', 'Team D', 'Team E', 'Team A', 'Team B'],\n 'team2': ['Team D', 'Team E', 'Team A', 'Team B', 'Team C'],\n 'score1': [99, 99, 99, 99, 99],\n 'score2': [99, 99, 99, 99, 99]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team C', 'Team D', 'Team A', 'Team A', 'Team B'])))\n def test_case_3(self):\n df = 
pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [0, 0, 0, 0, 0],\n 'score2': [0, 0, 0, 0, 0]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team A', 'Team B', 'Team D', 'Team D', 'Team E'])))\n \n def test_case_4(self):\n df = pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [10, 9, 8, 7, 6],\n 'score2': [9, 8, 7, 6, 5]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'])))\n \n def test_case_5(self):\n df = pd.DataFrame({'team1': ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'],\n 'team2': ['Team B', 'Team C', 'Team D', 'Team E', 'Team A'],\n 'score1': [10, 9, 8, 7, 6],\n 'score2': [11, 12, 13, 14, 15]})\n df = f_537(df)\n self.assertTrue('winner' in df.columns)\n self.assertTrue(df['winner'].equals(pd.Series(['Team B', 'Team C', 'Team D', 'Team E', 'Team A'])))", "apis": ["random.choice", "pandas.Series"], "libs": ["pandas", "random"], "doc": {"description": ["Generate a DataFrame that contains savegames for a number of games between different teams.", "Each row of the input DataFrame represents a match, and contains two teams and their respective scores.", "The function adds a 'winner' column to the DataFrame, which is the team with the highest score in each match.", "If the scores are equal, the winner is should be randomly decided."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame with columns 'team1', 'team2', 'score1', 'score2'."], "returns": ["df (pandas.DataFrame): The DataFrame with the added 'winner' column."], "reqs": ["pandas", "random"], "raises": [], "example": [">>> import numpy as np", ">>> import pandas as pd", ">>> df = pd.DataFrame({'team1': 
np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),", "... 'team2': np.random.choice(['Team A', 'Team B', 'Team C', 'Team D', 'Team E'], 20),", "... 'score1': np.random.randint(0, 10, 20),", "... 'score2': np.random.randint(0, 10, 20)})", ">>> df = f_537(df)", ">>> assert 'winner' in df.columns", ">>> assert df['winner'].dtype == object", ">>> assert all(winner in ['Team A', 'Team B', 'Team C', 'Team D', 'Team E'] for winner in df['winner'])"]}} {"task_id": "f_861", "prompt": "from bs4 import BeautifulSoup\nimport requests\n\n# Constants\nURL = \"http://example.com\"\n\n\ndef f_861(url=URL, from_encoding=\"cp1251\", use_lxml=False):\n \"\"\"\n Fetches a web page from a given URL, decodes its content from a specified encoding,\n and returns the parsed HTML using BeautifulSoup. If specified, 'lxml' is used as\n the parser for improved performance. In case of any failure (like network issues,\n invalid URL, or decoding errors), the function returns None.\n\n Parameters:\n - url (str): The URL of the webpage to fetch. Defaults to the constant URL.\n - from_encoding (str): The original encoding of the webpage content. Defaults to 'cp1251'.\n - use_lxml (bool): Flag to use 'lxml' as the parser for BeautifulSoup. If False, the default 'html.parser' is used. 
Defaults to False.\n\n Returns:\n - BeautifulSoup object if the fetch and parse are successful.\n - None if the URL is invalid, the request fails, or parsing fails.\n\n Requirements:\n - bs4\n - requests\n\n Example:\n >>> html = f_861('http://example.com', 'cp1251', True)\n >>> print(html.prettify()) if html else print(\"Error fetching or parsing the webpage.\")\n\n Notes:\n - The function returns None if the URL is empty or None.\n - Network errors, HTTP errors, and decoding issues are caught and result in None being returned.\n - If the HTTP response status code is 200 (indicating success), the content is decoded using the specified encoding\n - If the response status code is not 200, it implies an unsuccessful HTTP request (e.g., 404 Not Found, 403 Forbidden).\n In such cases, the function returns None, indicating that the webpage could not be successfully retrieved or was not available.\n \n \"\"\"", "canonical_solution": " if not url:\n return None\n try:\n response = requests.get(url, timeout=5)\n response.raise_for_status()\n if response.status_code == 200:\n decoded_content = response.content.decode(from_encoding)\n parser = \"lxml\" if use_lxml else \"html.parser\"\n soup = BeautifulSoup(decoded_content, parser)\n return soup\n else:\n return None\n except Exception as e:\n print(f\"An error occurred: {e}\")\n return None", "test": "from bs4 import BeautifulSoup\nimport unittest\nfrom unittest.mock import patch, MagicMock\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_861.\"\"\"\n @patch(\"requests.get\")\n def test_successful_fetch_and_parse_html_parser(self, mock_get):\n \"\"\"Test if the function correctly fetches and parses a webpage with valid encoding using html.parser.\"\"\"\n mock_get.return_value = MagicMock(\n status_code=200, content=b\"Valid HTML content\"\n )\n result = f_861(\"http://example.com\", \"utf8\")\n self.assertIsInstance(result, BeautifulSoup)\n @patch(\"requests.get\")\n def 
test_successful_fetch_and_parse_lxml_parser(self, mock_get):\n \"\"\"Test if the function correctly fetches and parses a webpage with valid encoding using lxml.\"\"\"\n mock_get.return_value = MagicMock(\n status_code=200, content=b\"Valid HTML content\"\n )\n result = f_861(\"http://example.com\", \"utf8\", use_lxml=True)\n self.assertIsInstance(result, BeautifulSoup)\n @patch(\"requests.get\")\n def test_connection_error_handling(self, mock_get):\n \"\"\"Test how the function handles connection errors.\"\"\"\n mock_get.side_effect = requests.exceptions.ConnectionError()\n result = f_861(\"http://example.com\", \"utf8\")\n self.assertIsNone(result)\n @patch(\"requests.get\")\n def test_incorrect_encoding_handling(self, mock_get):\n \"\"\"Test how the function handles incorrect or unsupported encodings.\"\"\"\n mock_get.return_value = MagicMock(\n status_code=200, content=b\"Valid HTML content\"\n )\n result = f_861(\"http://example.com\", \"invalid_encoding\")\n self.assertIsNone(result)\n @patch(\"requests.get\")\n def test_status_code_handling(self, mock_get):\n \"\"\"Test if the function handles non-200 status code responses correctly.\"\"\"\n mock_get.return_value = MagicMock(status_code=404)\n result = f_861(\"http://example.com\", \"utf8\")\n self.assertIsNone(result)\n @patch(\"requests.get\")\n def test_empty_url_handling(self, mock_get):\n \"\"\"Test how the function handles an empty URL.\"\"\"\n result = f_861(\"\", \"utf8\")\n self.assertIsNone(result)", "apis": ["bs4.BeautifulSoup", "requests.get"], "libs": ["bs4", "requests"], "doc": {"description": ["Fetches a web page from a given URL, decodes its content from a specified encoding,", "and returns the parsed HTML using BeautifulSoup. If specified, 'lxml' is used as", "the parser for improved performance. 
In case of any failure (like network issues,", "invalid URL, or decoding errors), the function returns None.", "Notes:", "- The function returns None if the URL is empty or None.", "- Network errors, HTTP errors, and decoding issues are caught and result in None being returned.", "- If the HTTP response status code is 200 (indicating success), the content is decoded using the specified encoding", "- If the response status code is not 200, it implies an unsuccessful HTTP request (e.g., 404 Not Found, 403 Forbidden).", "In such cases, the function returns None, indicating that the webpage could not be successfully retrieved or was not available."], "note": [], "params": ["url (str): The URL of the webpage to fetch. Defaults to the constant URL.", "from_encoding (str): The original encoding of the webpage content. Defaults to 'cp1251'.", "use_lxml (bool): Flag to use 'lxml' as the parser for BeautifulSoup. If False, the default 'html.parser' is used. Defaults to False."], "returns": ["BeautifulSoup object if the fetch and parse are successful.", "None if the URL is invalid, the request fails, or parsing fails."], "reqs": ["bs4", "requests"], "raises": [], "example": [">>> html = f_861('http://example.com', 'cp1251', True)", ">>> print(html.prettify()) if html else print(\"Error fetching or parsing the webpage.\")"]}} -{"task_id": "f_904", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_904(df: pd.DataFrame, column_name: str) -> (str, plt.Axes):\n \"\"\"\n This function assesses whether the distribution of values in a specified column of a DataFrame is\n uniform and visualizes this distribution using a histogram.\n\n Parameters:\n - df (pd.DataFrame): The DataFrame containing the data.\n - column_name (str): The name of the column to be evaluated.\n\n Returns:\n - str: A message indicating whether the distribution in the column is uniform or not.\n - plt.Axes: An Axes object displaying the histogram of the value distribution in the specified 
column.\n\n The function handles the following cases:\n - If the DataFrame is empty, the specified column does not exist in the DataFrame, or\n if the specified column contains only null values, the function returns a message\n \"The DataFrame is empty or the specified column has no data.\"\n In this case, a blank histogram with a title \"Distribution of values in [column_name] (No Data)\" is generated.\n - If the DataFrame and column are valid, the function calculates if the distribution of values is uniform.\n It returns a message stating whether the distribution is uniform or not.\n A histogram is generated to visualize the distribution of values in the specified column.\n This histogram displays the frequency of each value, with the number of bins set to the number\n of unique values in the column, an edge color of black, and a transparency alpha value of 0.7.\n The x-axis is labeled \"Values\", the y-axis is labeled \"Frequency\", and\n the title of the plot is \"Distribution of values in [column_name]\".\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> df = pd.DataFrame({'Category': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E']})\n >>> message, ax = f_904(df, 'Category')\n >>> print(message)\n The distribution of values is not uniform.\n \"\"\"", "canonical_solution": " if df.empty or column_name not in df.columns or df[column_name].isnull().all():\n message = \"The DataFrame is empty or the specified column has no data.\"\n _, ax = plt.subplots()\n ax.set_title(f\"Distribution of values in {column_name} (No Data)\")\n return message, ax\n\n unique_values_count = df[column_name].nunique()\n total_values = len(df[column_name])\n is_uniform = total_values % unique_values_count == 0 and all(\n df[column_name].value_counts() == total_values / unique_values_count\n )\n\n message = (\n \"The distribution of values is uniform.\"\n if is_uniform\n else \"The distribution of values is not uniform.\"\n )\n\n _, ax = plt.subplots()\n 
ax.hist(df[column_name], bins=unique_values_count, edgecolor=\"black\", alpha=0.7)\n ax.set_xticks(range(unique_values_count))\n ax.set_xlabel(\"Values\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(f\"Distribution of values in {column_name}\")\n\n return message, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for `f_904`.\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test the distribution of values in a column with a uniform distribution.\"\"\"\n df = pd.DataFrame({\"Category\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"]})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(message, \"The distribution of values is uniform.\")\n def test_non_uniform_distribution(self):\n \"\"\"Test the distribution of values in a column with a non-uniform distribution.\"\"\"\n df = pd.DataFrame({\"Category\": [\"A\", \"A\", \"B\", \"B\", \"B\", \"C\", \"C\", \"C\", \"C\"]})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(message, \"The distribution of values is not uniform.\")\n def test_single_value(self):\n \"\"\"Test the distribution of values in a column with a single value.\"\"\"\n df = pd.DataFrame({\"Category\": [\"A\", \"A\", \"A\", \"A\", \"A\", \"A\"]})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(message, \"The distribution of values is uniform.\")\n def test_multi_column(self):\n \"\"\"Test the distribution of values in a column with a multi-column DataFrame.\"\"\"\n df = pd.DataFrame(\n {\n \"Category\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n \"Type\": [\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\"],\n }\n )\n message, _ = f_904(df, \"Type\")\n self.assertEqual(message, \"The distribution of values is uniform.\")\n def test_empty_dataframe(self):\n \"\"\"Test the distribution of values in a column with an empty DataFrame.\"\"\"\n df = pd.DataFrame({\"Category\": []})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(\n message, \"The 
DataFrame is empty or the specified column has no data.\"\n )\n def tearDown(self):\n plt.close()", "apis": ["matplotlib.pyplot.Axes", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["This function assesses whether the distribution of values in a specified column of a DataFrame is", "uniform and visualizes this distribution using a histogram.", "The function handles the following cases:", "- If the DataFrame is empty, the specified column does not exist in the DataFrame, or", "if the specified column contains only null values, the function returns a message", "\"The DataFrame is empty or the specified column has no data.\"", "In this case, a blank histogram with a title \"Distribution of values in [column_name] (No Data)\" is generated.", "- If the DataFrame and column are valid, the function calculates if the distribution of values is uniform.", "It returns a message stating whether the distribution is uniform or not.", "A histogram is generated to visualize the distribution of values in the specified column.", "This histogram displays the frequency of each value, with the number of bins set to the number", "of unique values in the column, an edge color of black, and a transparency alpha value of 0.7.", "The x-axis is labeled \"Values\", the y-axis is labeled \"Frequency\", and", "the title of the plot is \"Distribution of values in [column_name]\"."], "note": [], "params": ["df (pd.DataFrame): The DataFrame containing the data.", "column_name (str): The name of the column to be evaluated."], "returns": ["str: A message indicating whether the distribution in the column is uniform or not.", "plt.Axes: An Axes object displaying the histogram of the value distribution in the specified column."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> df = pd.DataFrame({'Category': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E']})", ">>> message, ax = f_904(df, 'Category')", ">>> 
print(message)", "The distribution of values is not uniform."]}} +{"task_id": "f_904", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_904(df: pd.DataFrame, column_name: str) -> (str, plt.Axes):\n \"\"\"\n This function assesses whether the distribution of values in a specified column of a DataFrame is\n uniform and visualizes this distribution using a histogram.\n\n Parameters:\n - df (pd.DataFrame): The DataFrame containing the data.\n - column_name (str): The name of the column to be evaluated.\n\n Returns:\n - str: A message indicating whether the distribution in the column is uniform or not.\n - plt.Axes: An Axes object displaying the histogram of the value distribution in the specified column.\n\n The function handles the following cases:\n - If the DataFrame is empty, the specified column does not exist in the DataFrame, or\n if the specified column contains only null values, the function returns a message\n \"The DataFrame is empty or the specified column has no data.\"\n In this case, a blank histogram with a title \"Distribution of values in [column_name] (No Data)\" is generated.\n - If the DataFrame and column are valid, the function calculates if the distribution of values is uniform.\n It returns a message stating whether the distribution is uniform or not.\n A histogram is generated to visualize the distribution of values in the specified column.\n This histogram displays the frequency of each value, with the number of bins set to the number\n of unique values in the column, an edge color of black, and a transparency alpha value of 0.7.\n The x-axis is labeled \"Values\", the y-axis is labeled \"Frequency\", and\n the title of the plot is \"Distribution of values in [column_name]\".\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> df = pd.DataFrame({'Category': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E']})\n >>> message, ax = f_904(df, 'Category')\n >>> print(message)\n The distribution of 
values is not uniform.\n \"\"\"", "canonical_solution": " if df.empty or column_name not in df.columns or df[column_name].isnull().all():\n message = \"The DataFrame is empty or the specified column has no data.\"\n _, ax = plt.subplots()\n ax.set_title(f\"Distribution of values in {column_name} (No Data)\")\n return message, ax\n\n unique_values_count = df[column_name].nunique()\n total_values = len(df[column_name])\n is_uniform = total_values % unique_values_count == 0 and all(\n df[column_name].value_counts() == total_values / unique_values_count\n )\n\n message = (\n \"The distribution of values is uniform.\"\n if is_uniform\n else \"The distribution of values is not uniform.\"\n )\n\n _, ax = plt.subplots()\n ax.hist(df[column_name], bins=unique_values_count, edgecolor=\"black\", alpha=0.7)\n ax.set_xticks(range(unique_values_count))\n ax.set_xlabel(\"Values\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(f\"Distribution of values in {column_name}\")\n\n return message, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for `f_904`.\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test the distribution of values in a column with a uniform distribution.\"\"\"\n df = pd.DataFrame({\"Category\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"]})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(message, \"The distribution of values is uniform.\")\n def test_non_uniform_distribution(self):\n \"\"\"Test the distribution of values in a column with a non-uniform distribution.\"\"\"\n df = pd.DataFrame({\"Category\": [\"A\", \"A\", \"B\", \"B\", \"B\", \"C\", \"C\", \"C\", \"C\"]})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(message, \"The distribution of values is not uniform.\")\n def test_single_value(self):\n \"\"\"Test the distribution of values in a column with a single value.\"\"\"\n df = pd.DataFrame({\"Category\": [\"A\", \"A\", \"A\", \"A\", \"A\", 
\"A\"]})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(message, \"The distribution of values is uniform.\")\n def test_multi_column(self):\n \"\"\"Test the distribution of values in a column with a multi-column DataFrame.\"\"\"\n df = pd.DataFrame(\n {\n \"Category\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n \"Type\": [\"X\", \"X\", \"Y\", \"Y\", \"Z\", \"Z\"],\n }\n )\n message, _ = f_904(df, \"Type\")\n self.assertEqual(message, \"The distribution of values is uniform.\")\n def test_empty_dataframe(self):\n \"\"\"Test the distribution of values in a column with an empty DataFrame.\"\"\"\n df = pd.DataFrame({\"Category\": []})\n message, _ = f_904(df, \"Category\")\n self.assertEqual(\n message, \"The DataFrame is empty or the specified column has no data.\"\n )\n def tearDown(self):\n plt.close()", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots", "matplotlib.pyplot.Axes"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["This function assesses whether the distribution of values in a specified column of a DataFrame is", "uniform and visualizes this distribution using a histogram.", "The function handles the following cases:", "- If the DataFrame is empty, the specified column does not exist in the DataFrame, or", "if the specified column contains only null values, the function returns a message", "\"The DataFrame is empty or the specified column has no data.\"", "In this case, a blank histogram with a title \"Distribution of values in [column_name] (No Data)\" is generated.", "- If the DataFrame and column are valid, the function calculates if the distribution of values is uniform.", "It returns a message stating whether the distribution is uniform or not.", "A histogram is generated to visualize the distribution of values in the specified column.", "This histogram displays the frequency of each value, with the number of bins set to the number", "of unique values in the column, an edge color of black, and a transparency alpha 
value of 0.7.", "The x-axis is labeled \"Values\", the y-axis is labeled \"Frequency\", and", "the title of the plot is \"Distribution of values in [column_name]\"."], "note": [], "params": ["df (pd.DataFrame): The DataFrame containing the data.", "column_name (str): The name of the column to be evaluated."], "returns": ["str: A message indicating whether the distribution in the column is uniform or not.", "plt.Axes: An Axes object displaying the histogram of the value distribution in the specified column."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> df = pd.DataFrame({'Category': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'E', 'E']})", ">>> message, ax = f_904(df, 'Category')", ">>> print(message)", "The distribution of values is not uniform."]}} {"task_id": "f_769", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import LabelEncoder\n\ndef f_769(file_path: str, column_name: str) -> pd.DataFrame:\n \"\"\"\n Load a CSV file into a Pandas DataFrame, replace all occurrences of the string '\\n' with the string '
'\n in the specified column, and encode the specified column as a categorical variable using LabelEncoder from sklearn.\n \n Parameters:\n - file_path (str): The path to the CSV file to be read.\n - column_name (str): The name of the column in which to replace '\\n' and to encode.\n \n Returns:\n pd.DataFrame: The updated and encoded Pandas DataFrame.\n \n Requirements:\n - pandas\n - sklearn.preprocessing.LabelEncoder\n \n Example:\n >>> df = f_769('data.csv', 'Category')\n >>> print(df.head())\n \"\"\"", "canonical_solution": " # Load the CSV file into a DataFrame\n df = pd.read_csv(file_path)\n \n # Replace occurrences of '\\n' with '
'\n df[column_name] = df[column_name].replace({'\\n': '
'}, regex=True)\n \n # Initialize LabelEncoder and fit_transform the specified column\n le = LabelEncoder()\n df[column_name] = le.fit_transform(df[column_name])\n \n return df", "test": "import os\nimport unittest\nimport pandas as pd\nimport shutil\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # create folder for test data\n os.makedirs('test_data', exist_ok=True)\n data = {\n 'Category': ['Fruit\\n', 'Vegetable\\n', 'Meat\\n', 'Dairy\\n'],\n 'Price': [1.2, 2.3, 3.4, 4.5]\n }\n pd.DataFrame(data).to_csv('test_data/test_case_1.csv', index=False)\n \n data = {\n 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],\n 'Age': [25, 30, 35, 40, 45],\n 'Language': ['Python\\nJava', 'C++\\nJavaScript', 'Ruby\\nC#', 'PHP\\nSwift', 'Kotlin\\nR']\n }\n pd.DataFrame(data).to_csv('test_data/test_case_2.csv', index=False)\n \n data = {\n 'Item': ['Item1', 'Item2', 'Item3', 'Item4', 'Item5']\n }\n pd.DataFrame(data).to_csv('test_data/test_case_3.csv', index=False)\n \n data = {\n 'Language': ['Python\\nJava', 'C++\\nJavaScript', 'Ruby\\nC#', 'PHP\\nSwift', 'Kotlin\\nR'],\n 'Country': ['USA', 'UK', 'China', 'Japan', 'Australia']\n }\n pd.DataFrame(data).to_csv('test_data/test_case_4.csv', index=False)\n \n def tearDown(self):\n shutil.rmtree('test_data')\n \n def test_case_1(self):\n # Input 1: A simple CSV file with a 'Category' column containing '\\n' characters\n # Expected: The '\\n' should be replaced with '
' and the column should be encoded\n df = f_769('test_data/test_case_1.csv', 'Category')\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIn('Category', df.columns)\n self.assertNotIn('\\n', df['Category'].astype(str))\n self.assertTrue(df['Category'].dtype.name == 'int64')\n \n def test_case_2(self):\n # Input 2: A CSV file with different columns\n # Expected: Only the specified column should be affected\n df = f_769('test_data/test_case_2.csv', 'Name')\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIn('Name', df.columns)\n self.assertNotIn('\\n', df['Name'].astype(str))\n self.assertTrue(df['Name'].dtype.name == 'int64')\n self.assertTrue(df['Age'].dtype.name == 'int64')\n \n def test_case_3(self):\n # Input 3: A CSV file with a column that doesn't contain '\\n'\n # Expected: The column should still be encoded\n df = f_769('test_data/test_case_3.csv', 'Item')\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIn('Item', df.columns)\n self.assertTrue(df['Item'].dtype.name == 'int64')\n \n def test_case_4(self):\n # Input 4: A CSV file with multiple columns, affecting only one\n # Expected: Only the specified column should be encoded\n df = f_769('test_data/test_case_4.csv', 'Language')\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIn('Language', df.columns)\n self.assertNotIn('\\n', df['Language'].astype(str))\n self.assertTrue(df['Language'].dtype.name == 'int64')\n self.assertTrue(df['Country'].dtype.name == 'object')\n \n def test_case_5(self):\n # Input 5: A CSV file with no columns matching the specified column\n # Expected: An exception should be raised\n with self.assertRaises(Exception):\n df = f_769('test_data/test_case_5.csv', 'NonExistentColumn')", "apis": ["pandas.read_csv", "pandas.DataFrame", "sklearn.preprocessing.LabelEncoder"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Load a CSV file into a Pandas DataFrame, replace all occurrences of the string '\\n' with the string '
'", "in the specified column, and encode the specified column as a categorical variable using LabelEncoder from sklearn."], "note": [], "params": ["file_path (str): The path to the CSV file to be read.", "column_name (str): The name of the column in which to replace '\\n' and to encode."], "returns": ["pd.DataFrame: The updated and encoded Pandas DataFrame."], "reqs": ["pandas", "sklearn.preprocessing.LabelEncoder"], "raises": [], "example": [">>> df = f_769('data.csv', 'Category')", ">>> print(df.head())"]}} -{"task_id": "f_780", "prompt": "import re\nfrom collections import Counter\n\ndef f_780(input_str):\n \"\"\"\n Count the frequency of each alphanumeric character in a given string after removing all non-alphanumeric characters,\n treating uppercase and lowercase letters as the same.\n\n Requirements:\n - re\n - collections.Counter\n\n Parameters:\n - input_str (str): The input string containing alphanumeric characters mixed with special characters and/or spaces.\n\n Returns:\n - dict: A dictionary with characters as keys (all lowercase) and their frequencies in the input string as values.\n \n Examples:\n >>> f_780(\"Hello, World!\")\n Counter({'l': 3, 'o': 2, 'h': 1, 'e': 1, 'w': 1, 'r': 1, 'd': 1})\n \"\"\"", "canonical_solution": " cleaned_str = re.sub('[^A-Za-z0-9]+', '', input_str).lower()\n freq_dict = Counter(cleaned_str)\n return freq_dict", "test": "import unittest\nclass TestF780(unittest.TestCase):\n def test_only_letters(self):\n # Expected output adjusted for lowercase\n self.assertEqual(f_780(\"Hello, World!\"), {'h': 1, 'e': 1, 'l': 3, 'o': 2, 'w': 1, 'r': 1, 'd': 1})\n def test_empty_string(self):\n self.assertEqual(f_780(\"\"), {})\n def test_repeated_numbers(self):\n self.assertEqual(f_780(\"12345 12345\"), {'1': 2, '2': 2, '3': 2, '4': 2, '5': 2})\n def test_mixed_case_letters(self):\n # Expecting all lowercase after adjustment for case insensitivity\n self.assertEqual(f_780(\"AAaaBBbbCCcc\"), {'a': 4, 'b': 4, 'c': 4})\n def 
test_numbers_only(self):\n self.assertEqual(f_780(\"111222333444555\"), {'1': 3, '2': 3, '3': 3, '4': 3, '5': 3})\n def test_uppercase_only(self):\n # Expecting all lowercase after adjustment for case insensitivity\n self.assertEqual(f_780(\"AAAABBBBCCCC\"), {'a': 4, 'b': 4, 'c': 4})\n def test_no_alphanumeric(self):\n self.assertEqual(f_780(\"!!!@@@###$$$%%%^^^&&&\"), {})", "apis": ["re.sub", "collections.Counter"], "libs": ["collections", "re"], "doc": {"description": ["Count the frequency of each alphanumeric character in a given string after removing all non-alphanumeric characters,", "treating uppercase and lowercase letters as the same."], "note": [], "params": ["input_str (str): The input string containing alphanumeric characters mixed with special characters and/or spaces."], "returns": ["dict: A dictionary with characters as keys (all lowercase) and their frequencies in the input string as values."], "reqs": ["re", "collections.Counter"], "raises": [], "example": ["Examples:", ">>> f_780(\"Hello, World!\")", "Counter({'l': 3, 'o': 2, 'h': 1, 'e': 1, 'w': 1, 'r': 1, 'd': 1})"]}} -{"task_id": "f_867", "prompt": "import numpy as np\nimport pandas as pd\nimport seaborn as sns\n\n# Constants\nPLOT_TITLE = \"Value Distribution\"\n\n\ndef f_867(data_dict):\n \"\"\"\n Processes a dictionary of numerical data to create a pandas DataFrame, removes None values, and generates a histogram \n of the data values using seaborn. The histogram's bins are dynamically calculated based on the range of the data. \n If the DataFrame is empty or the data lacks variability (all values are the same after removing None values), \n the function does not generate a plot.\n\n Parameters:\n - data_dict (dict): A dictionary with keys as column names and values as lists of numerical data. 
\n The data can include None values, which will be removed.\n\n Returns:\n - DataFrame: A pandas DataFrame created from the input dictionary, excluding None values.\n - AxesSubplot or None: A seaborn histogram plot object if the DataFrame contains variable data; \n None if the DataFrame is empty or if all values are identical.\n\n Requirements:\n - pandas\n - numpy\n - seaborn\n\n Note:\n - Calculates the minimum and maximum values in the DataFrame.\n - Dynamically sets the number of bins for the histogram based on the number of data points, with a minimum of 2 \n and a maximum of 11 bins.\n - Create evenly spaced bin edges between the minimum and maximum values.\n - KDE (Kernel Density Estimate) is turned off. \n - Sets the plot title to the predefined constant `PLOT_TITLE`.\n\n\n Example:\n >>> data = {'a': [1, 2, 3, None], 'b': [5, 6, None, 8]}\n >>> df, plot = f_867(data)\n >>> df\n a b\n 0 1.0 5.0\n 1 2.0 6.0\n >>> plot.get_title() if plot is not None else 'No plot generated'\n 'Value Distribution'\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_dict).dropna()\n\n if df.empty or df.nunique().min() < 2:\n return df, None\n\n min_val, max_val = df.values.min(), df.values.max()\n num_bins = max(min(11, len(df) // 2), 2)\n bin_edges = np.linspace(min_val, max_val, num_bins)\n\n plot = sns.histplot(df.values.flatten(), bins=bin_edges, kde=False)\n plot.set_title(PLOT_TITLE)\n\n return df, plot", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for function f_867.\"\"\"\n def test_dataframe_creation(self):\n \"\"\"\n Test if the function correctly creates a DataFrame from the input dictionary.\n \"\"\"\n data = {\"a\": [1, 2, 3, 4], \"b\": [5, 6, 7, 8]}\n df, _ = f_867(data)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape, (4, 2))\n def test_distribution_plot(self):\n \"\"\"\n Test if the function correctly creates a distribution plot with the correct title and non-empty bars.\n 
\"\"\"\n data = {\"a\": [1, 2, 3, 4], \"b\": [5, 6, 7, 8]}\n _, plot = f_867(data)\n self.assertEqual(plot.get_title(), \"Value Distribution\")\n self.assertTrue(len(plot.patches) > 0)\n def test_empty_dictionary(self):\n \"\"\"\n Test if the function correctly handles an empty dictionary, returning an empty DataFrame and no plot.\n \"\"\"\n data = {}\n df, plot = f_867(data)\n self.assertEqual(df.shape, (0, 0))\n self.assertIsNone(plot)\n def test_number_of_bins(self):\n \"\"\"\n Test if the function dynamically calculates the number of bins for the plot based on the data.\n \"\"\"\n data = {\"a\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}\n _, plot = f_867(data)\n self.assertTrue(len(plot.patches) <= 11)\n def test_dataframe_without_none(self):\n \"\"\"\n Test if the function correctly removes rows with None values from the DataFrame.\n \"\"\"\n data = {\"a\": [1, 2, None, 4], \"b\": [5, None, 7, 8]}\n df, _ = f_867(data)\n self.assertEqual(df.shape, (2, 2))\n self.assertNotIn(None, df.values.flatten())", "apis": ["pandas.DataFrame", "numpy.linspace", "seaborn.histplot"], "libs": ["numpy", "pandas", "seaborn"], "doc": {"description": ["Processes a dictionary of numerical data to create a pandas DataFrame, removes None values, and generates a histogram", "of the data values using seaborn. 
The histogram's bins are dynamically calculated based on the range of the data.", "If the DataFrame is empty or the data lacks variability (all values are the same after removing None values),", "the function does not generate a plot."], "note": ["Calculates the minimum and maximum values in the DataFrame.", "Dynamically sets the number of bins for the histogram based on the number of data points, with a minimum of 2", "and a maximum of 11 bins.", "Create evenly spaced bin edges between the minimum and maximum values.", "KDE (Kernel Density Estimate) is turned off.", "Sets the plot title to the predefined constant `PLOT_TITLE`."], "params": ["data_dict (dict): A dictionary with keys as column names and values as lists of numerical data.", "The data can include None values, which will be removed."], "returns": ["DataFrame: A pandas DataFrame created from the input dictionary, excluding None values.", "AxesSubplot or None: A seaborn histogram plot object if the DataFrame contains variable data;", "None if the DataFrame is empty or if all values are identical."], "reqs": ["pandas", "numpy", "seaborn"], "raises": [], "example": [">>> data = {'a': [1, 2, 3, None], 'b': [5, 6, None, 8]}", ">>> df, plot = f_867(data)", ">>> df", "a b", "0 1.0 5.0", "1 2.0 6.0", ">>> plot.get_title() if plot is not None else 'No plot generated'", "'Value Distribution'"]}} +{"task_id": "f_780", "prompt": "import re\nfrom collections import Counter\n\ndef f_780(input_str):\n \"\"\"\n Count the frequency of each alphanumeric character in a given string after removing all non-alphanumeric characters,\n treating uppercase and lowercase letters as the same.\n\n Requirements:\n - re\n - collections.Counter\n\n Parameters:\n - input_str (str): The input string containing alphanumeric characters mixed with special characters and/or spaces.\n\n Returns:\n - dict: A dictionary with characters as keys (all lowercase) and their frequencies in the input string as values.\n \n Examples:\n >>> 
f_780(\"Hello, World!\")\n Counter({'l': 3, 'o': 2, 'h': 1, 'e': 1, 'w': 1, 'r': 1, 'd': 1})\n \"\"\"", "canonical_solution": " cleaned_str = re.sub('[^A-Za-z0-9]+', '', input_str).lower()\n freq_dict = Counter(cleaned_str)\n return freq_dict", "test": "import unittest\nclass TestF780(unittest.TestCase):\n def test_only_letters(self):\n # Expected output adjusted for lowercase\n self.assertEqual(f_780(\"Hello, World!\"), {'h': 1, 'e': 1, 'l': 3, 'o': 2, 'w': 1, 'r': 1, 'd': 1})\n def test_empty_string(self):\n self.assertEqual(f_780(\"\"), {})\n def test_repeated_numbers(self):\n self.assertEqual(f_780(\"12345 12345\"), {'1': 2, '2': 2, '3': 2, '4': 2, '5': 2})\n def test_mixed_case_letters(self):\n # Expecting all lowercase after adjustment for case insensitivity\n self.assertEqual(f_780(\"AAaaBBbbCCcc\"), {'a': 4, 'b': 4, 'c': 4})\n def test_numbers_only(self):\n self.assertEqual(f_780(\"111222333444555\"), {'1': 3, '2': 3, '3': 3, '4': 3, '5': 3})\n def test_uppercase_only(self):\n # Expecting all lowercase after adjustment for case insensitivity\n self.assertEqual(f_780(\"AAAABBBBCCCC\"), {'a': 4, 'b': 4, 'c': 4})\n def test_no_alphanumeric(self):\n self.assertEqual(f_780(\"!!!@@@###$$$%%%^^^&&&\"), {})", "apis": ["re.sub", "collections.Counter"], "libs": ["re", "collections"], "doc": {"description": ["Count the frequency of each alphanumeric character in a given string after removing all non-alphanumeric characters,", "treating uppercase and lowercase letters as the same."], "note": [], "params": ["input_str (str): The input string containing alphanumeric characters mixed with special characters and/or spaces."], "returns": ["dict: A dictionary with characters as keys (all lowercase) and their frequencies in the input string as values."], "reqs": ["re", "collections.Counter"], "raises": [], "example": ["Examples:", ">>> f_780(\"Hello, World!\")", "Counter({'l': 3, 'o': 2, 'h': 1, 'e': 1, 'w': 1, 'r': 1, 'd': 1})"]}} +{"task_id": "f_867", "prompt": "import 
numpy as np\nimport pandas as pd\nimport seaborn as sns\n\n# Constants\nPLOT_TITLE = \"Value Distribution\"\n\n\ndef f_867(data_dict):\n \"\"\"\n Processes a dictionary of numerical data to create a pandas DataFrame, removes None values, and generates a histogram \n of the data values using seaborn. The histogram's bins are dynamically calculated based on the range of the data. \n If the DataFrame is empty or the data lacks variability (all values are the same after removing None values), \n the function does not generate a plot.\n\n Parameters:\n - data_dict (dict): A dictionary with keys as column names and values as lists of numerical data. \n The data can include None values, which will be removed.\n\n Returns:\n - DataFrame: A pandas DataFrame created from the input dictionary, excluding None values.\n - Axes or None: A seaborn histogram plot object if the DataFrame contains variable data; \n None if the DataFrame is empty or if all values are identical.\n\n Requirements:\n - pandas\n - numpy\n - seaborn\n\n Note:\n - Calculates the minimum and maximum values in the DataFrame.\n - Dynamically sets the number of bins for the histogram based on the number of data points, with a minimum of 2 \n and a maximum of 11 bins.\n - Create evenly spaced bin edges between the minimum and maximum values.\n - KDE (Kernel Density Estimate) is turned off. 
\n - Sets the plot title to the predefined constant `PLOT_TITLE`.\n\n\n Example:\n >>> data = {'a': [1, 2, 3, None], 'b': [5, 6, None, 8]}\n >>> df, plot = f_867(data)\n >>> df\n a b\n 0 1.0 5.0\n 1 2.0 6.0\n >>> plot.get_title() if plot is not None else 'No plot generated'\n 'Value Distribution'\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_dict).dropna()\n\n if df.empty or df.nunique().min() < 2:\n return df, None\n\n min_val, max_val = df.values.min(), df.values.max()\n num_bins = max(min(11, len(df) // 2), 2)\n bin_edges = np.linspace(min_val, max_val, num_bins)\n\n plot = sns.histplot(df.values.flatten(), bins=bin_edges, kde=False)\n plot.set_title(PLOT_TITLE)\n\n return df, plot", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for function f_867.\"\"\"\n def test_dataframe_creation(self):\n \"\"\"\n Test if the function correctly creates a DataFrame from the input dictionary.\n \"\"\"\n data = {\"a\": [1, 2, 3, 4], \"b\": [5, 6, 7, 8]}\n df, _ = f_867(data)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.shape, (4, 2))\n def test_distribution_plot(self):\n \"\"\"\n Test if the function correctly creates a distribution plot with the correct title and non-empty bars.\n \"\"\"\n data = {\"a\": [1, 2, 3, 4], \"b\": [5, 6, 7, 8]}\n _, plot = f_867(data)\n self.assertEqual(plot.get_title(), \"Value Distribution\")\n self.assertTrue(len(plot.patches) > 0)\n def test_empty_dictionary(self):\n \"\"\"\n Test if the function correctly handles an empty dictionary, returning an empty DataFrame and no plot.\n \"\"\"\n data = {}\n df, plot = f_867(data)\n self.assertEqual(df.shape, (0, 0))\n self.assertIsNone(plot)\n def test_number_of_bins(self):\n \"\"\"\n Test if the function dynamically calculates the number of bins for the plot based on the data.\n \"\"\"\n data = {\"a\": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}\n _, plot = f_867(data)\n self.assertTrue(len(plot.patches) <= 11)\n def 
test_dataframe_without_none(self):\n \"\"\"\n Test if the function correctly removes rows with None values from the DataFrame.\n \"\"\"\n data = {\"a\": [1, 2, None, 4], \"b\": [5, None, 7, 8]}\n df, _ = f_867(data)\n self.assertEqual(df.shape, (2, 2))\n self.assertNotIn(None, df.values.flatten())", "apis": ["pandas.DataFrame", "numpy.linspace", "seaborn.histplot"], "libs": ["seaborn", "numpy", "pandas"], "doc": {"description": ["Processes a dictionary of numerical data to create a pandas DataFrame, removes None values, and generates a histogram", "of the data values using seaborn. The histogram's bins are dynamically calculated based on the range of the data.", "If the DataFrame is empty or the data lacks variability (all values are the same after removing None values),", "the function does not generate a plot."], "note": ["Calculates the minimum and maximum values in the DataFrame.", "Dynamically sets the number of bins for the histogram based on the number of data points, with a minimum of 2", "and a maximum of 11 bins.", "Create evenly spaced bin edges between the minimum and maximum values.", "KDE (Kernel Density Estimate) is turned off.", "Sets the plot title to the predefined constant `PLOT_TITLE`."], "params": ["data_dict (dict): A dictionary with keys as column names and values as lists of numerical data.", "The data can include None values, which will be removed."], "returns": ["DataFrame: A pandas DataFrame created from the input dictionary, excluding None values.", "Axes or None: A seaborn histogram plot object if the DataFrame contains variable data;", "None if the DataFrame is empty or if all values are identical."], "reqs": ["pandas", "numpy", "seaborn"], "raises": [], "example": [">>> data = {'a': [1, 2, 3, None], 'b': [5, 6, None, 8]}", ">>> df, plot = f_867(data)", ">>> df", "a b", "0 1.0 5.0", "1 2.0 6.0", ">>> plot.get_title() if plot is not None else 'No plot generated'", "'Value Distribution'"]}} {"task_id": "f_561", "prompt": "import 
math\nimport pandas as pd\n\ndef f_561(tuples_list):\n \"\"\"\n Given a list of tuples turn them into a Pandas DataFrame with math.sin applied to each number.\n\n Parameters:\n - tuples_list (list): The list of tuples.\n \n Returns:\n - df (DataFrame): A pandas DataFrame. Each row of df corresponds to a tuple from tuples_list, with the values being the sine of the original values in the tuple.\n\n Requirements:\n - math\n - pandas\n\n Example:\n >>> df = f_561([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)])\n >>> print(df)\n 0 1 2 3\n 0 0.841471 0.909297 0.141120 -0.756802\n 1 -0.958924 -0.279415 0.656987 0.989358\n 2 0.412118 -0.544021 -0.999990 -0.536573\n \"\"\"", "canonical_solution": " df = pd.DataFrame([(math.sin(n) for n in t) for t in tuples_list])\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = f_561([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)])\n self.assertEqual(df.shape, (3, 4))\n self.assertEqual(df.iloc[0, 0], math.sin(1))\n self.assertEqual(df.iloc[0, 1], math.sin(2))\n self.assertEqual(df.iloc[0, 2], math.sin(3))\n self.assertEqual(df.iloc[0, 3], math.sin(4))\n self.assertEqual(df.iloc[1, 0], math.sin(5))\n self.assertEqual(df.iloc[1, 1], math.sin(6))\n self.assertEqual(df.iloc[1, 2], math.sin(7))\n self.assertEqual(df.iloc[1, 3], math.sin(8))\n self.assertEqual(df.iloc[2, 0], math.sin(9))\n self.assertEqual(df.iloc[2, 1], math.sin(10))\n self.assertEqual(df.iloc[2, 2], math.sin(11))\n self.assertEqual(df.iloc[2, 3], math.sin(12))\n def test_case_2(self):\n df = f_561([(1, 2, 3, 4)])\n self.assertEqual(df.shape, (1, 4))\n self.assertEqual(df.iloc[0, 0], math.sin(1))\n self.assertEqual(df.iloc[0, 1], math.sin(2))\n self.assertEqual(df.iloc[0, 2], math.sin(3))\n self.assertEqual(df.iloc[0, 3], math.sin(4))\n def test_case_3(self):\n df = f_561([(1, 2, 3, 4), (5, 6, 7, 8)])\n self.assertEqual(df.shape, (2, 4))\n self.assertEqual(df.iloc[0, 0], math.sin(1))\n self.assertEqual(df.iloc[0, 1], 
math.sin(2))\n self.assertEqual(df.iloc[0, 2], math.sin(3))\n self.assertEqual(df.iloc[0, 3], math.sin(4))\n self.assertEqual(df.iloc[1, 0], math.sin(5))\n self.assertEqual(df.iloc[1, 1], math.sin(6))\n self.assertEqual(df.iloc[1, 2], math.sin(7))\n self.assertEqual(df.iloc[1, 3], math.sin(8))\n def test_case_4(self):\n df = f_561([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12), (13, 14, 15, 16)])\n self.assertEqual(df.shape, (4, 4))\n self.assertEqual(df.iloc[0, 0], math.sin(1))\n self.assertEqual(df.iloc[0, 1], math.sin(2))\n self.assertEqual(df.iloc[0, 2], math.sin(3))\n self.assertEqual(df.iloc[0, 3], math.sin(4))\n self.assertEqual(df.iloc[1, 0], math.sin(5))\n self.assertEqual(df.iloc[1, 1], math.sin(6))\n self.assertEqual(df.iloc[1, 2], math.sin(7))\n self.assertEqual(df.iloc[1, 3], math.sin(8))\n self.assertEqual(df.iloc[2, 0], math.sin(9))\n self.assertEqual(df.iloc[2, 1], math.sin(10))\n self.assertEqual(df.iloc[2, 2], math.sin(11))\n self.assertEqual(df.iloc[2, 3], math.sin(12))\n self.assertEqual(df.iloc[3, 0], math.sin(13))\n self.assertEqual(df.iloc[3, 1], math.sin(14))\n self.assertEqual(df.iloc[3, 2], math.sin(15))\n self.assertEqual(df.iloc[3, 3], math.sin(16))\n def test_case_5(self):\n df = f_561([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12), (13, 14, 15, 16), (17, 18, 19, 20)])\n self.assertEqual(df.shape, (5, 4))\n self.assertEqual(df.iloc[0, 0], math.sin(1))\n self.assertEqual(df.iloc[0, 1], math.sin(2))\n self.assertEqual(df.iloc[0, 2], math.sin(3))\n self.assertEqual(df.iloc[0, 3], math.sin(4))\n self.assertEqual(df.iloc[1, 0], math.sin(5))\n self.assertEqual(df.iloc[1, 1], math.sin(6))\n self.assertEqual(df.iloc[1, 2], math.sin(7))\n self.assertEqual(df.iloc[1, 3], math.sin(8))\n self.assertEqual(df.iloc[2, 0], math.sin(9))\n self.assertEqual(df.iloc[2, 1], math.sin(10))\n self.assertEqual(df.iloc[2, 2], math.sin(11))\n self.assertEqual(df.iloc[2, 3], math.sin(12))\n self.assertEqual(df.iloc[3, 0], math.sin(13))\n self.assertEqual(df.iloc[3, 
1], math.sin(14))\n self.assertEqual(df.iloc[3, 2], math.sin(15))\n self.assertEqual(df.iloc[3, 3], math.sin(16))\n self.assertEqual(df.iloc[4, 0], math.sin(17))\n self.assertEqual(df.iloc[4, 1], math.sin(18))\n self.assertEqual(df.iloc[4, 2], math.sin(19))\n self.assertEqual(df.iloc[4, 3], math.sin(20))", "apis": ["pandas.DataFrame", "math.sin"], "libs": ["pandas", "math"], "doc": {"description": ["Given a list of tuples turn them into a Pandas DataFrame with math.sin applied to each number."], "note": [], "params": ["tuples_list (list): The list of tuples."], "returns": ["df (DataFrame): A pandas DataFrame. Each row of df corresponds to a tuple from tuples_list, with the values being the sine of the original values in the tuple."], "reqs": ["math", "pandas"], "raises": [], "example": [">>> df = f_561([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)])", ">>> print(df)", "0 1 2 3", "0 0.841471 0.909297 0.141120 -0.756802", "1 -0.958924 -0.279415 0.656987 0.989358", "2 0.412118 -0.544021 -0.999990 -0.536573"]}} -{"task_id": "f_424", "prompt": "import sqlite3\nimport pandas as pd\n\n\ndef f_424(db_name, table_name):\n \"\"\"\n Plot the relationship between the first and second numerical columns of an SQLite3 table.\n\n Parameters:\n - db_name (str): The absolute path to the SQLite3 database.\n - table_name (str): The name of the table to plot from.\n\n Returns:\n - matplotlib.axes._axes.Axes: Scatterplot with column name labeled on their respective axes.\n\n Requirements:\n - sqlite3\n - pandas\n\n Example:\n >>> ax = f_424('/path/to/database/test.db', 'People')\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0.9400000000000001, 0, '0.94'), ... 
]\n \"\"\"", "canonical_solution": " # Connect to the SQLite database\n conn = sqlite3.connect(db_name)\n\n # Dynamically get the first two numerical columns from the table (excluding 'id')\n df = pd.read_sql_query(f\"SELECT * from {table_name}\", conn)\n numerical_columns = df.select_dtypes(include=[\"float64\", \"int64\"]).columns.tolist()\n if \"id\" in numerical_columns:\n numerical_columns.remove(\"id\")\n if len(numerical_columns) < 2:\n raise ValueError(\"The table must have at least two numerical columns to plot.\")\n\n # Plot the relationship between the two columns\n ax = df.plot.scatter(x=numerical_columns[0], y=numerical_columns[1])\n return ax", "test": "import unittest\nimport sqlite3\nimport os\nimport matplotlib.pyplot as plt\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.test_db_path = os.path.join(self.temp_dir.name, \"test.db\")\n self.another_test_db_path = os.path.join(self.temp_dir.name, \"another_test.db\")\n self.nonexistent_db_path = os.path.join(self.temp_dir.name, \"nonexistent.db\")\n # Setup for 'test.db'\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE People (id INTEGER PRIMARY KEY, name TEXT, age INTEGER, height REAL)\"\n )\n self.data = [\n (\"Alice\", 25, 5.5),\n (\"Bob\", 30, 6.0),\n (\"Charlie\", 35, 5.8),\n (\"David\", 40, 6.2),\n (\"Eve\", 45, 5.9),\n (\"Frank\", 50, 5.6),\n ]\n cur.executemany(\n \"INSERT INTO People (name, age, height) VALUES (?, ?, ?)\", self.data\n )\n # Setup for 'another_test.db'\n with sqlite3.connect(self.another_test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE Animals (id INTEGER PRIMARY KEY, name TEXT, lifespan INTEGER, weight REAL)\"\n )\n animal_data = [\n (\"Dog\", 13, 30.0),\n (\"Cat\", 15, 4.5),\n (\"Elephant\", 70, 6000.0),\n (\"Dolphin\", 20, 150.0),\n ]\n cur.executemany(\n \"INSERT INTO Animals (name, lifespan, weight) VALUES (?, 
?, ?)\",\n animal_data,\n )\n def tearDown(self):\n self.temp_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n # Test basic functionality\n ax = f_424(self.test_db_path, \"People\")\n self.assertEqual(ax.get_xlabel(), \"age\")\n self.assertEqual(ax.get_ylabel(), \"height\")\n self.assertEqual(len(ax.collections[0].get_offsets()), 6)\n def test_case_2(self):\n # Test handling non-existent table\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"NonExistentTable\")\n def test_case_3(self):\n # Test handling non-existent db\n with self.assertRaises(Exception):\n f_424(self.nonexistent_db_path, \"People\")\n def test_case_4(self):\n # Table with removed numerical column should raise error\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n f\"CREATE TABLE temp AS SELECT id, name, age FROM People WHERE name IN ('Alice', 'Bob')\"\n )\n cur.execute(f\"DROP TABLE People\")\n cur.execute(f\"ALTER TABLE temp RENAME TO People\")\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"People\")\n # Revert changes\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(f\"CREATE TABLE temp AS SELECT * FROM People\")\n cur.execute(f\"DROP TABLE People\")\n cur.execute(\n f\"CREATE TABLE People (id INTEGER PRIMARY KEY, name TEXT, age INTEGER, height REAL)\"\n )\n cur.executemany(\n f\"INSERT INTO People (name, age, height) VALUES (?, ?, ?)\", self.data\n )\n def test_case_5(self):\n # Test another set of data/db\n ax = f_424(self.another_test_db_path, \"Animals\")\n self.assertEqual(ax.get_xlabel(), \"lifespan\")\n self.assertEqual(ax.get_ylabel(), \"weight\")\n self.assertEqual(len(ax.collections[0].get_offsets()), 4)\n def test_case_6(self):\n # Test handling of a table with only one numerical column\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE SingleNumCol (id INTEGER PRIMARY KEY, name TEXT, age INTEGER)\"\n )\n 
with self.assertRaises(Exception):\n f_424(self.test_db_path, \"SingleNumCol\")\n def test_case_7(self):\n # Test handling of a table with no numerical columns\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE NoNumCols (id INTEGER PRIMARY KEY, name TEXT, description TEXT)\"\n )\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"NoNumCols\")\n def test_case_8(self):\n # Test a table where 'id' is the only numerical column\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\"CREATE TABLE OnlyIDNum (id INTEGER PRIMARY KEY, name TEXT)\")\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"OnlyIDNum\")\n def test_case_9(self):\n # Test plotting when the first two numerical columns are not 'id', 'age', or 'height'\n with sqlite3.connect(self.another_test_db_path) as conn:\n cur = conn.cursor()\n custom_data = [(\"Lion\", 15, 190.5), (\"Tiger\", 20, 220.0)]\n cur.executemany(\n \"INSERT INTO Animals (name, lifespan, weight) VALUES (?, ?, ?)\",\n custom_data,\n )\n ax = f_424(self.another_test_db_path, \"Animals\")\n self.assertEqual(ax.get_xlabel(), \"lifespan\")\n self.assertEqual(ax.get_ylabel(), \"weight\")\n self.assertGreaterEqual(len(ax.collections[0].get_offsets()), 2)", "apis": ["sqlite3.connect", "pandas.read_sql_query"], "libs": ["sqlite3", "pandas"], "doc": {"description": ["Plot the relationship between the first and second numerical columns of an SQLite3 table."], "note": [], "params": ["db_name (str): The absolute path to the SQLite3 database.", "table_name (str): The name of the table to plot from."], "returns": ["matplotlib.axes._axes.Axes: Scatterplot with column name labeled on their respective axes."], "reqs": ["sqlite3", "pandas"], "raises": [], "example": [">>> ax = f_424('/path/to/database/test.db', 'People')", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0.9400000000000001, 0, '0.94'), ... 
]"]}} -{"task_id": "f_845", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_845(data, column_name=\"target_column\"):\n \"\"\"\n Converts a given JSON data into a Pandas DataFrame and plots a histogram of a specified column.\n The function handles non-numeric columns by converting them to categorical type and then to numeric codes. \n It also checks if the specified column exists in the DataFrame.\n\n - The histogram's title is set to 'Histogram of '.\n - The histogram's x-label are set to the name of the specified column.\n \n Parameters:\n - data (list of dict)\n - column_name (str, optional)\n\n Returns:\n - DataFrame: A pandas DataFrame created from the input JSON data.\n - Axes: A matplotlib Axes object showing the histogram plot of the specified column.\n\n Exceptions:\n - ValueError: Raised if the specified column name does not exist in the DataFrame.\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> sample_data = [{'userId': 1, 'value': 10}, {'userId': 2, 'value': 15}]\n >>> df, ax = f_845(sample_data, 'userId')\n >>> print(df)\n userId value\n 0 1 10\n 1 2 15\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n\n if column_name not in df.columns:\n raise ValueError(f\"Column '{column_name}' not found in the DataFrame.\")\n\n if not pd.api.types.is_numeric_dtype(df[column_name]):\n df[column_name] = df[column_name].astype(\"category\").cat.codes\n\n _, ax = plt.subplots()\n df[column_name].hist(ax=ax)\n ax.set_title(f\"Histogram of {column_name}\")\n ax.set_xlabel(column_name)\n return df, ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_845 function.\"\"\"\n def setUp(self):\n # Sample data for testing\n self.sample_data = [\n {\"userId\": 1, \"id\": 1, \"title\": \"A\", \"completed\": False},\n {\"userId\": 1, \"id\": 2, \"title\": \"B\", \"completed\": True},\n {\"userId\": 2, \"id\": 3, \"title\": \"A\", \"completed\": False},\n {\"userId\": 
2, \"id\": 4, \"title\": \"B\", \"completed\": True},\n {\"userId\": 3, \"id\": 5, \"title\": \"A\", \"completed\": False},\n {\"userId\": 3, \"id\": 6, \"title\": \"B\", \"completed\": True},\n {\"userId\": 3, \"id\": 7, \"title\": \"B\", \"completed\": True},\n ]\n def test_normal_case(self):\n \"\"\"Test if the function returns correct DataFrame and histogram for a valid column.\"\"\"\n df, ax = f_845(self.sample_data, \"userId\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(len(df), len(self.sample_data))\n self.assertEqual(ax.get_title(), \"Histogram of userId\")\n self.assertEqual(ax.get_xlabel(), \"userId\")\n def test_non_existent_column(self):\n \"\"\"Test if the function raises an error for a non-existent column.\"\"\"\n with self.assertRaises(ValueError):\n f_845(self.sample_data, \"non_existent_column\")\n def test_empty_data(self):\n \"\"\"Test the function with empty data.\"\"\"\n with self.assertRaises(ValueError):\n f_845([], \"userId\")\n def test_non_numeric_data(self):\n \"\"\"Test the function with a non-numeric column.\"\"\"\n df, ax = f_845(self.sample_data, \"title\")\n self.assertTrue(pd.api.types.is_numeric_dtype(df[\"title\"]))\n self.assertEqual(ax.get_title(), \"Histogram of title\")\n self.assertEqual(ax.get_xlabel(), \"title\")\n def test_duplicate_values(self):\n \"\"\"Test the function with a column that has duplicate values.\"\"\"\n df, ax = f_845(self.sample_data, \"title\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(ax.get_title(), \"Histogram of title\")\n self.assertEqual(ax.get_xlabel(), \"title\")\n def tearDown(self):\n plt.clf()", "apis": ["pandas.api.types.is_numeric_dtype", "pandas.api", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Converts a given JSON data into a Pandas DataFrame and plots a histogram of a specified column.", "The function handles non-numeric columns by converting them to categorical type and 
then to numeric codes.", "It also checks if the specified column exists in the DataFrame.", "- The histogram's title is set to 'Histogram of '.", "- The histogram's x-label are set to the name of the specified column.", "Exceptions:", "- ValueError: Raised if the specified column name does not exist in the DataFrame."], "note": [], "params": ["data (list of dict)", "column_name (str, optional)"], "returns": ["DataFrame: A pandas DataFrame created from the input JSON data.", "Axes: A matplotlib Axes object showing the histogram plot of the specified column."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> sample_data = [{'userId': 1, 'value': 10}, {'userId': 2, 'value': 15}]", ">>> df, ax = f_845(sample_data, 'userId')", ">>> print(df)", "userId value", "0 1 10", "1 2 15"]}} -{"task_id": "f_808", "prompt": "import os\nimport re\nimport shutil\n\n\ndef f_808(source_directory: str, target_directory: str, pattern: str = r\"\\d{4}\") -> int:\n \"\"\"\n Moves files matching a specific regex pattern from a source directory to a target directory.\n\n Parameters:\n - source_directory (str): Path of the source directory from which files will be moved.\n - target_directory (str): Path of the target directory to which files will be moved.\n - pattern (str): Regex pattern to match filenames.\n Defaults to r'\\\\d{4}' to match filenames containing four contiguous digits.\n\n Returns:\n - int: The number of files successfully moved.\n\n Requirements:\n - os\n - re\n - shutil\n\n Note:\n - If source_directory does not exist or is not a directory, this function returns 0.\n - If target_directory does not exist, this function will create it.\n\n Examples:\n >>> os.listdir('/path/to/source')\n ['1000.txt', '1001.txt', '1002.txt', 'not_a_match.txt']\n >>> f_808('/path/to/source', '/path/to/target')\n 3\n \"\"\"", "canonical_solution": " moved_files_count = 0\n\n if not os.path.exists(source_directory) or not os.path.isdir(source_directory):\n return 0\n\n if not 
os.path.exists(target_directory):\n os.makedirs(target_directory)\n\n for root, _, files in os.walk(source_directory):\n for file in files:\n if re.search(pattern, file):\n shutil.move(\n os.path.join(root, file), os.path.join(target_directory, file)\n )\n moved_files_count += 1\n\n return moved_files_count", "test": "import unittest\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n def create_test_files(self, directory, file_names):\n # Helper to create files for testing\n for file_name in file_names:\n with open(os.path.join(directory, file_name), \"a\") as file:\n file.write(\"test content\")\n def test_files_moved(self):\n # Test basic case with default pattern\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n self.create_test_files(\n src,\n [\n \"1234.txt\",\n \"test5678.txt\",\n \"nope.txt\",\n \"another1234.txt\",\n \"4321done.txt\",\n ],\n )\n result = f_808(src, dst)\n self.assertEqual(\n result, 4, \"Should move 4 files matching the default pattern.\"\n )\n for file_name in [\n \"1234.txt\",\n \"another1234.txt\",\n \"4321done.txt\",\n \"test5678.txt\",\n ]:\n self.assertTrue(\n os.path.exists(os.path.join(dst, file_name)),\n f\"{file_name} should be in the target directory\",\n )\n def test_files_moved_with_custom_pattern(self):\n # Test case with custom pattern\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n self.create_test_files(\n src,\n [\n \"1234.txt\",\n \"test5678.txt\",\n \"nope.txt\",\n \"another1234.txt\",\n \"4321done.txt\",\n ],\n )\n result = f_808(src, dst, r\"test\\w+\")\n self.assertEqual(\n result, 1, \"Should move 1 file matching the custom pattern 'test\\\\w+.'\"\n )\n def test_no_files_moved_if_no_match(self):\n # Test no match\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n self.create_test_files(src, [\"nope.txt\"])\n result = f_808(src, dst)\n self.assertEqual(result, 0, \"Should move 0 files if no 
match.\")\n def test_return_zero_if_source_does_not_exist(self):\n # Test source_directory if not exists\n with tempfile.TemporaryDirectory() as dst:\n result = f_808(os.path.join(dst, \"non_existing_dir\"), dst)\n self.assertEqual(\n result, 0, \"Should return 0 if source directory does not exist.\"\n )\n def test_target_directory_created_if_not_exist(self):\n # Test that destination directory will be created if it did not exist\n with tempfile.TemporaryDirectory() as src:\n self.create_test_files(src, [\"1234.txt\"])\n new_target = os.path.join(src, \"new_target_dir\")\n f_808(src, new_target)\n self.assertTrue(\n os.path.exists(new_target),\n \"Target directory should be created if it does not exist.\",\n )\n def test_no_files_in_source(self):\n # Test empty source direcotry\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n result = f_808(src, dst)\n self.assertEqual(\n result, 0, \"Should move 0 files if source directory is empty.\"\n )", "apis": ["os.path.exists", "re.search", "shutil.move", "os.walk", "os.path.isdir", "os.path", "os.makedirs", "os.path.join"], "libs": ["re", "shutil", "os"], "doc": {"description": ["Moves files matching a specific regex pattern from a source directory to a target directory."], "note": ["If source_directory does not exist or is not a directory, this function returns 0.", "If target_directory does not exist, this function will create it."], "params": ["source_directory (str): Path of the source directory from which files will be moved.", "target_directory (str): Path of the target directory to which files will be moved.", "pattern (str): Regex pattern to match filenames.", "Defaults to r'\\\\d{4}' to match filenames containing four contiguous digits."], "returns": ["int: The number of files successfully moved."], "reqs": ["os", "re", "shutil"], "raises": [], "example": ["Examples:", ">>> os.listdir('/path/to/source')", "['1000.txt', '1001.txt', '1002.txt', 'not_a_match.txt']", ">>> 
f_808('/path/to/source', '/path/to/target')", "3"]}} -{"task_id": "f_421", "prompt": "import sqlite3\nimport numpy as np\nfrom random import choice, seed\n\n\ndef f_421(db_path, table_name, num_entries, random_seed=None):\n \"\"\"\n Insert random data into an SQLite3 table that contains random names, ages, and heights.\n If the table does not exist, it will be created.\n This function uses the following constants:\n - NAMES: List of possible names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia'].\n - AGES: Range of possible ages from 18 to 64.\n - HEIGHTS: Range of possible heights from 150cm to 199cm.\n\n Parameters:\n db_path (str): The path to the SQLite3 database file.\n table_name (str): The name of the table to insert data into.\n num_entries (int): The number of entries to insert. Must not be negative.\n random_seed (int, optional): Seed for random number generation. Defaults to None (no fixed seed).\n\n Returns:\n int: The number of rows inserted.\n\n Requirements:\n - sqlite3\n - numpy\n - random.choice\n - random.seed\n\n Example:\n >>> f_421('path_to_test.db', 'People', 100, random_seed=42)\n 100\n \"\"\"", "canonical_solution": " # Setting the random seed if provided\n if random_seed is not None:\n seed(random_seed)\n np.random.seed(random_seed)\n\n if num_entries < 0:\n raise ValueError(\"num_entries cannot be negative.\")\n\n NAMES = [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"]\n AGES = list(range(18, 65))\n HEIGHTS = list(range(150, 200))\n\n conn = sqlite3.connect(db_path)\n cur = conn.cursor()\n\n table_creation_sql = (\n \"CREATE TABLE IF NOT EXISTS {} (name TEXT, age INTEGER, height INTEGER)\".format(\n table_name\n )\n )\n cur.execute(table_creation_sql)\n\n inserted_rows = 0\n for _ in range(num_entries):\n name = choice(NAMES)\n age = choice(AGES)\n height = choice(HEIGHTS)\n insertion_sql = \"INSERT INTO {} VALUES (?, ?, ?)\".format(table_name)\n cur.execute(insertion_sql, (name, age, height))\n inserted_rows += 
cur.rowcount\n\n conn.commit()\n\n return inserted_rows", "test": "import unittest\nimport os\nimport sqlite3\nimport tempfile\nclass TestCases(unittest.TestCase):\n NAMES = [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"]\n AGES = range(18, 65)\n HEIGHTS = range(150, 200)\n def setUp(self):\n # Setup a temporary directory before each test\n self.temp_dir = tempfile.TemporaryDirectory()\n self.db_path = os.path.join(self.temp_dir.name, \"test.db\")\n def tearDown(self):\n # Clean up the temporary directory after each test\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test inserting 50 entries with a fixed seed\n result = f_421(self.db_path, \"SamplePeople\", 50, random_seed=42)\n self.assertEqual(result, 50)\n def test_case_2(self):\n # Test inserting 30 entries into a new table with a fixed seed\n result = f_421(self.db_path, \"NewPeople\", 30, random_seed=42)\n self.assertEqual(result, 30)\n def test_case_3(self):\n # Test inserting 20 entries, verifying smaller batch works as expected\n result = f_421(self.db_path, \"SamplePeople\", 20, random_seed=42)\n self.assertEqual(result, 20)\n def test_case_4(self):\n # Test inserting a large number of entries (200) with a fixed seed\n result = f_421(self.db_path, \"SamplePeople\", 200, random_seed=42)\n self.assertEqual(result, 200)\n def test_case_5(self):\n # Test inserting 0 entries to check handling of empty input\n result = f_421(self.db_path, \"SamplePeople\", 0, random_seed=42)\n self.assertEqual(result, 0)\n def test_case_6(self):\n # Test the content of the rows for correctness against expected values\n f_421(self.db_path, \"ContentCheck\", 10, random_seed=42)\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM ContentCheck\")\n rows = cur.fetchall()\n for row in rows:\n self.assertIn(row[0], self.NAMES)\n self.assertIn(row[1], self.AGES)\n self.assertIn(row[2], self.HEIGHTS)\n def test_case_7(self):\n # Test invalid db path\n with 
self.assertRaises(sqlite3.OperationalError):\n f_421(\"/invalid/path.db\", \"TestTable\", 10)\n def test_case_8(self):\n # Test invalid table names (SQL keywords)\n with self.assertRaises(sqlite3.OperationalError):\n f_421(self.db_path, \"Select\", 10)\n def test_case_9(self):\n # Test handling invalid num_entries\n with self.assertRaises(Exception):\n f_421(self.db_path, \"TestTable\", -1)\n with self.assertRaises(TypeError):\n f_421(self.db_path, \"TestTable\", \"ten\")\n def test_case_10(self):\n # Test handling invalid random seed\n with self.assertRaises(Exception):\n f_421(self.db_path, \"TestTable\", 10, random_seed=\"invalid\")\n def test_case_11(self):\n # Test different schema in existing table\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(\"CREATE TABLE TestTable (id INTEGER)\")\n conn.close()\n with self.assertRaises(sqlite3.OperationalError):\n f_421(self.db_path, \"TestTable\", 10)\n def test_case_12(self):\n # Insert a known set of data and verify its integrity\n f_421(self.db_path, \"IntegrityCheck\", 1, random_seed=42)\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM IntegrityCheck\")\n row = cur.fetchone()\n self.assertIsNotNone(row)\n def test_case_13(self):\n # Test against SQL injection in table_name parameter\n malicious_name = \"Test; DROP TABLE IntegrityCheck;\"\n with self.assertRaises(sqlite3.OperationalError):\n f_421(self.db_path, malicious_name, 1)", "apis": ["random.seed", "random.choice", "numpy.random", "numpy.random.seed", "sqlite3.connect"], "libs": ["numpy", "sqlite3", "random"], "doc": {"description": ["Insert random data into an SQLite3 table that contains random names, ages, and heights.", "If the table does not exist, it will be created.", "This function uses the following constants:", "- NAMES: List of possible names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia'].", "- AGES: Range of possible ages from 18 to 64.", "- HEIGHTS: Range of possible heights 
from 150cm to 199cm."], "note": [], "params": ["db_path (str): The path to the SQLite3 database file.", "table_name (str): The name of the table to insert data into.", "num_entries (int): The number of entries to insert. Must not be negative.", "random_seed (int, optional): Seed for random number generation. Defaults to None (no fixed seed)."], "returns": ["int: The number of rows inserted."], "reqs": ["sqlite3", "numpy", "random.choice", "random.seed"], "raises": [], "example": [">>> f_421('path_to_test.db', 'People', 100, random_seed=42)", "100"]}} -{"task_id": "f_426", "prompt": "from collections import Counter\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport itertools\n\ndef f_426(list_of_menuitems, title=\"Menu Distribution\", color=\"blue\", width=1.0):\n \"\"\"\n Given a nested list of menu items, flatten the list using itertool chain, count the occurrences of each item, then\n plot a histogram with an alphabetically sorted x-axis labeled as \"Menu Items\" and y-axis as \"Frequency\".\n\n Parameters:\n - list_of_menuitems (list): A non-empty nested list of menu items. Each element is a list of menu item strings.\n - title (str, optional): The title of the histogram plot. Default is \"Menu Distribution\".\n - color (str, optional): The color of the bars in the histogram. Default is \"blue\".\n - width (float, optional): The width of the bars in the histogram. 
Default is 1.0.\n\n Returns:\n - ax (object): An Axes object representing the histogram plot.\n\n Requirements:\n - collections.Counter\n - numpy\n - matplotlib.pyplot\n - itertools\n\n Example:\n >>> f_426([['Pizza', 'Burger'], ['Pizza', 'Coke'], ['Pasta', 'Coke']])\n \n >>> f_426(['Burger'], title='A Title', color='red', width=5.0)\n \n \"\"\"", "canonical_solution": " # Flatten the list\n flat_list = list(itertools.chain(*list_of_menuitems))\n\n # Count the occurrences of each menu item\n counter = Counter(flat_list)\n labels, values = zip(*sorted(counter.items(), key=lambda x: x[0]))\n indexes = np.arange(len(labels))\n\n # Plot the histogram\n fig, ax = plt.subplots()\n ax.bar(indexes, values, width, color=color)\n ax.set_xticklabels(labels)\n ax.set_xlabel(\"Menu Items\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(title)\n\n return ax", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n input_data = [[\"Pizza\", \"Burger\"], [\"Pizza\", \"Coke\"], [\"Pasta\", \"Coke\"]]\n ax = f_426(input_data)\n # Test default plot properties\n self.assertEqual(ax.get_title(), \"Menu Distribution\")\n self.assertEqual(ax.get_xlabel(), \"Menu Items\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n for p in ax.patches:\n # RGBA color\n self.assertEqual(p.get_facecolor(), (0.0, 0.0, 1.0, 1.0))\n # bar width\n self.assertEqual(p.get_width(), 1.0)\n def test_case_2(self):\n input_data = [[\"Pizza\", \"Burger\"], [\"Pizza\", \"Coke\"], [\"Pasta\", \"Coke\"]]\n ax = f_426(input_data, title=\"Custom Title\", color=\"red\", width=0.8)\n # Test custom plot properties\n self.assertEqual(ax.get_title(), \"Custom Title\")\n self.assertEqual(ax.get_xlabel(), \"Menu Items\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n for p in ax.patches:\n # RGBA color\n self.assertEqual(p.get_facecolor(), (1.0, 0.0, 0.0, 1.0))\n # bar width\n self.assertEqual(p.get_width(), 0.8)\n def test_case_3(self):\n input_data = [[\"Burger\"], [\"Pizza\"], 
[\"Pasta\"]]\n ax = f_426(input_data)\n # Test count\n bars = [p.get_height() for p in ax.patches]\n self.assertEqual(bars, [1, 1, 1])\n def test_case_4(self):\n input_data = [[\"Carrot\", \"Apple\"], [\"Apple\", \"Banana\"], [\"Banana\"]]\n ax = f_426(input_data)\n # Test x-axis order\n self.assertEqual(\n [_._text for _ in ax.get_xticklabels() if _._text],\n [\"Apple\", \"Banana\", \"Carrot\"],\n )\n def test_case_5(self):\n # Test input edge case: some empty elements\n ax = f_426([[], [\"Apple\"]])\n self.assertEqual(len(ax.patches), 1)\n for p in ax.patches:\n # bar width\n self.assertEqual(p.get_width(), 1.0)\n self.assertEqual(p.get_height(), 1)\n def test_case_6(self):\n with self.assertRaises(ValueError):\n f_426([])\n with self.assertRaises(ValueError):\n f_426([[]])\n with self.assertRaises(ValueError):\n f_426(\"\")\n with self.assertRaises(TypeError):\n f_426(None)\n with self.assertRaises(TypeError):\n f_426(1)\n with self.assertRaises(TypeError):\n f_426([1])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "collections.Counter", "matplotlib.pyplot.subplots", "itertools.chain"], "libs": ["numpy", "collections", "itertools", "matplotlib"], "doc": {"description": ["Given a nested list of menu items, flatten the list using itertool chain, count the occurrences of each item, then", "plot a histogram with an alphabetically sorted x-axis labeled as \"Menu Items\" and y-axis as \"Frequency\"."], "note": [], "params": ["list_of_menuitems (list): A non-empty nested list of menu items. Each element is a list of menu item strings.", "title (str, optional): The title of the histogram plot. Default is \"Menu Distribution\".", "color (str, optional): The color of the bars in the histogram. Default is \"blue\".", "width (float, optional): The width of the bars in the histogram. 
Default is 1.0."], "returns": ["ax (object): An Axes object representing the histogram plot."], "reqs": ["collections.Counter", "numpy", "matplotlib.pyplot", "itertools"], "raises": [], "example": [">>> f_426([['Pizza', 'Burger'], ['Pizza', 'Coke'], ['Pasta', 'Coke']])", "", ">>> f_426(['Burger'], title='A Title', color='red', width=5.0)", ""]}} -{"task_id": "f_749", "prompt": "import os\nimport re\n\ndef f_749(pattern: str, replacement: str, directory: str) -> bool:\n \"\"\"\n Renames all files in a directory that match a particular pattern with a given replacement string.\n \n Functionality:\n - Searches for files in the specified directory whose names contain the given pattern.\n - Renames those files by replacing the pattern with the given replacement string.\n \n Input:\n - pattern (str): The pattern to search for in the filenames.\n - replacement (str): The string to replace the pattern with.\n - directory (str): The directory in which to search for files.\n \n Output:\n - Returns a boolean value. 
True if the operation was successful, otherwise False.\n \n Requirements:\n - re\n - os\n\n Examples:\n >>> f_749('draft', 'final', '/home/user/documents')\n True\n >>> f_749('tmp', 'temp', '/home/user/downloads')\n False\n \"\"\"", "canonical_solution": " try:\n for file in os.listdir(directory):\n if re.search(pattern, file):\n new_filename = re.sub(pattern, replacement, file)\n os.rename(os.path.join(directory, file), os.path.join(directory, new_filename))\n return True\n except Exception as e:\n return False", "test": "import unittest\nimport tempfile\nimport shutil\nfrom pathlib import Path\nclass TestCases(unittest.TestCase):\n \n def setUp(self):\n self.test_dir = tempfile.mkdtemp()\n \n def tearDown(self):\n shutil.rmtree(self.test_dir)\n \n def create_test_files(self, filenames):\n for filename in filenames:\n Path(f\"{self.test_dir}/{filename}\").touch()\n \n def test_renaming_files(self):\n self.create_test_files([\"draft1.txt\", \"draft2.txt\", \"draft3.txt\"])\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n expected_files = sorted([\"final1.txt\", \"final2.txt\", \"final3.txt\"])\n actual_files = sorted(os.listdir(self.test_dir))\n self.assertEqual(expected_files, actual_files)\n \n def test_no_matching_files(self):\n self.create_test_files([\"file1.txt\", \"file2.txt\", \"file3.txt\"])\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n expected_files = sorted([\"file1.txt\", \"file2.txt\", \"file3.txt\"])\n actual_files = sorted(os.listdir(self.test_dir))\n self.assertEqual(expected_files, actual_files)\n \n def test_nonexistent_directory(self):\n result = f_749(\"draft\", \"final\", \"/nonexistent/directory\")\n self.assertFalse(result)\n \n def test_empty_directory(self):\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n self.assertEqual([], os.listdir(self.test_dir))\n \n def test_complex_pattern_renaming(self):\n 
self.create_test_files([\"draft_file1.txt\", \"file_draft2.txt\", \"draft3file.txt\"])\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n expected_files = sorted([\"final_file1.txt\", \"file_final2.txt\", \"final3file.txt\"])\n actual_files = sorted(os.listdir(self.test_dir))\n self.assertEqual(expected_files, actual_files)", "apis": ["re.search", "os.listdir", "os.rename", "re.sub", "os.path", "os.path.join"], "libs": ["re", "os"], "doc": {"description": ["Renames all files in a directory that match a particular pattern with a given replacement string.", "Functionality:", "- Searches for files in the specified directory whose names contain the given pattern.", "- Renames those files by replacing the pattern with the given replacement string.", "Input:", "- pattern (str): The pattern to search for in the filenames.", "- replacement (str): The string to replace the pattern with.", "- directory (str): The directory in which to search for files.", "Output:", "- Returns a boolean value. True if the operation was successful, otherwise False."], "note": [], "params": [], "returns": [], "reqs": ["re", "os"], "raises": [], "example": ["Examples:", ">>> f_749('draft', 'final', '/home/user/documents')", "True", ">>> f_749('tmp', 'temp', '/home/user/downloads')", "False"]}} -{"task_id": "f_922", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_922(arr):\n \"\"\"\n Analyzes the distribution of values in a NumPy array to determine if it is uniform and\n generates a histogram representing this distribution.\n\n Parameters:\n - arr (numpy.ndarray): A NumPy array containing the values to be analyzed. \n The array can contain any hashable data type (e.g., integers, floats, strings).\n\n Returns:\n - tuple: A tuple containing two elements:\n - uniform_distribution (bool): A boolean value indicating whether the distribution is uniform. 
\n - Returns True if every unique value in the array appears the same number of times,\n indicating a uniform distribution.\n - Returns False otherwise.\n - ax (matplotlib.axes.Axes): An Axes object displaying the histogram of the array's value distribution.\n - The histogram's bins correspond to the unique values in the array.\n - The frequency of each unique value is represented by the height of the corresponding bin.\n\n Note:\n - The bin is set to `np.arange(len(unique) + 1) - 0.5` to align each bin with its corresponding unique value.\n\n Requirements:\n - numpy\n - matplotlib\n\n Example:\n >>> arr = np.array([\"A\", \"A\", \"B\", \"B\"])\n >>> is_uniform, ax = f_922(arr)\n >>> is_uniform\n True\n \"\"\"", "canonical_solution": " unique, counts = np.unique(arr, return_counts=True)\n uniform_distribution = len(set(counts)) == 1\n\n _, ax = plt.subplots()\n ax.hist(arr, bins=np.arange(len(unique) + 1) - 0.5, rwidth=0.8, align=\"mid\")\n ax.set_xticks(range(len(unique)))\n ax.set_xticklabels(unique)\n\n return uniform_distribution, ax", "test": "import numpy as np\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_922\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test uniform distribution.\"\"\"\n arr = np.array([\"A\", \"A\", \"B\", \"B\"])\n uniform, _ = f_922(arr)\n self.assertTrue(uniform)\n def test_non_uniform_distribution(self):\n \"\"\"Test non-uniform distribution.\"\"\"\n arr = np.array([\"A\", \"A\", \"B\", \"B\", \"B\", \"C\", \"C\", \"C\", \"C\", \"D\", \"E\", \"E\"])\n uniform, _ = f_922(arr)\n self.assertFalse(uniform)\n def test_single_value(self):\n \"\"\"Test single value.\"\"\"\n arr = np.array([\"A\", \"A\", \"A\", \"A\"])\n uniform, _ = f_922(arr)\n self.assertTrue(uniform)\n def test_multiple_equal_values(self):\n \"\"\"Test multiple equal values.\"\"\"\n arr = np.array([\"A\", \"A\", \"B\", \"B\", \"C\", \"C\", \"D\", \"D\"])\n uniform, _ = f_922(arr)\n self.assertTrue(uniform)\n def 
test_varying_values(self):\n \"\"\"Test varying values.\"\"\"\n arr = np.array([\"A\", \"B\", \"B\", \"C\", \"C\", \"C\", \"D\", \"D\", \"D\", \"D\"])\n uniform, _ = f_922(arr)\n self.assertFalse(uniform)\n def tearDown(self):\n plt.close()", "apis": ["numpy.arange", "numpy.unique", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Analyzes the distribution of values in a NumPy array to determine if it is uniform and", "generates a histogram representing this distribution."], "note": ["The bin is set to `np.arange(len(unique) + 1) - 0.5` to align each bin with its corresponding unique value."], "params": ["arr (numpy.ndarray): A NumPy array containing the values to be analyzed.", "The array can contain any hashable data type (e.g., integers, floats, strings)."], "returns": ["tuple: A tuple containing two elements:", "uniform_distribution (bool): A boolean value indicating whether the distribution is uniform.", "Returns True if every unique value in the array appears the same number of times,", "indicating a uniform distribution.", "Returns False otherwise.", "ax (matplotlib.axes.Axes): An Axes object displaying the histogram of the array's value distribution.", "The histogram's bins correspond to the unique values in the array.", "The frequency of each unique value is represented by the height of the corresponding bin."], "reqs": ["numpy", "matplotlib"], "raises": [], "example": [">>> arr = np.array([\"A\", \"A\", \"B\", \"B\"])", ">>> is_uniform, ax = f_922(arr)", ">>> is_uniform", "True"]}} -{"task_id": "f_920", "prompt": "from datetime import datetime\nimport pandas as pd\n\n# For Python versions lower than 3.9, use 'pytz' instead of 'zoneinfo'\ntry:\n from zoneinfo import ZoneInfo\nexcept ImportError:\n from pytz import timezone as ZoneInfo\n\nTIME_FORMAT = \"%d/%m/%y %H:%M:%S.%f\"\n\ndef f_920(time_strings, target_tz):\n \"\"\"\n Convert a list of time strings from UTC to a specified timezone and return a DataFrame.\n\n 
The function processes each UTC time string in the given list,\n converts it to the specified timezone, and stores the results in a DataFrame.\n\n Parameters:\n - time_strings (list of str): A list of time strings in UTC. Each string should be formatted as 'dd/mm/yy HH:MM:SS.fff'.\n - target_tz (str): The timezone identifier (e.g., 'America/New_York') to which the time strings should be converted.\n\n Returns:\n - pandas.DataFrame: A DataFrame with two columns: 'Original Time'\n containing the UTC times and 'Converted Time' containing the times converted to the target timezone.\n\n Requirements:\n - pandas\n - datetime\n - zoneinfo.ZoneInfo (Python 3.9+) or pytz.timezone.ZoneInfo (Python < 3.9)\n \n Note:\n - The function assumes that the input times are in UTC.\n\n Example:\n >>> time_strings = ['30/03/09 16:31:32.123', '15/04/10 14:25:46.789', '20/12/11 12:34:56.000']\n >>> df = f_920(time_strings, 'America/New_York')\n >>> print(df)\n Original Time Converted Time\n 0 30/03/09 16:31:32.123 30/03/09 12:31:32.123000\n 1 15/04/10 14:25:46.789 15/04/10 10:25:46.789000\n 2 20/12/11 12:34:56.000 20/12/11 07:34:56.000000\n \"\"\"", "canonical_solution": " data = []\n\n for time_string in time_strings:\n utc_time = datetime.strptime(time_string, TIME_FORMAT)\n converted_time = utc_time.replace(tzinfo=ZoneInfo(\"UTC\")).astimezone(\n ZoneInfo(target_tz)\n )\n data.append([time_string, converted_time.strftime(TIME_FORMAT)])\n\n df = pd.DataFrame(data, columns=[\"Original Time\", \"Converted Time\"])\n return df", "test": "import unittest\ntry:\n from zoneinfo import ZoneInfo\nexcept ImportError:\n from pytz import timezone as ZoneInfo\n# Test cases\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_920\"\"\"\n def test_conversion_from_utc(self):\n \"\"\"Test conversion from UTC to Eastern Standard Time.\"\"\"\n time_strings = [\"01/01/21 00:00:00.000\", \"01/01/21 12:00:00.000\"]\n df = f_920(time_strings, \"America/New_York\")\n expected = [\"31/12/20 
19:00:00.000000\", \"01/01/21 07:00:00.000000\"]\n self.assertEqual(list(df[\"Converted Time\"]), expected)\n def test_conversion_from_non_utc(self):\n \"\"\"Test conversion from Eastern Standard Time to India Standard Time.\"\"\"\n time_strings = [\"01/01/21 00:00:00.000\", \"01/01/21 12:00:00.000\"]\n df = f_920(time_strings, \"Asia/Kolkata\")\n expected = [\"01/01/21 05:30:00.000000\", \"01/01/21 17:30:00.000000\"]\n self.assertEqual(list(df[\"Converted Time\"]), expected)\n def test_empty_list(self):\n \"\"\"Test empty list.\"\"\"\n df = f_920([], \"America/New_York\")\n self.assertEqual(len(df), 0)\n def test_invalid_time_string(self):\n \"\"\"Test invalid time string.\"\"\"\n with self.assertRaises(ValueError):\n f_920([\"invalid_time_string\"], \"America/New_York\")\n def test_non_standard_time_format(self):\n \"\"\"Test handling of non-standard time format.\"\"\"\n time_strings = [\"2021-01-01 00:00:00\"]\n with self.assertRaises(ValueError):\n f_920(time_strings, \"America/New_York\")", "apis": ["pandas.DataFrame", "datetime.datetime.strptime", "pytz.timezone"], "libs": ["pandas", "datetime", "pytz"], "doc": {"description": ["Convert a list of time strings from UTC to a specified timezone and return a DataFrame.", "The function processes each UTC time string in the given list,", "converts it to the specified timezone, and stores the results in a DataFrame."], "note": ["The function assumes that the input times are in UTC."], "params": ["time_strings (list of str): A list of time strings in UTC. 
Each string should be formatted as 'dd/mm/yy HH:MM:SS.fff'.", "target_tz (str): The timezone identifier (e.g., 'America/New_York') to which the time strings should be converted."], "returns": ["pandas.DataFrame: A DataFrame with two columns: 'Original Time'", "containing the UTC times and 'Converted Time' containing the times converted to the target timezone."], "reqs": ["pandas", "datetime", "zoneinfo.ZoneInfo (Python 3.9+) or pytz.timezone.ZoneInfo (Python < 3.9)"], "raises": [], "example": [">>> time_strings = ['30/03/09 16:31:32.123', '15/04/10 14:25:46.789', '20/12/11 12:34:56.000']", ">>> df = f_920(time_strings, 'America/New_York')", ">>> print(df)", "Original Time Converted Time", "0 30/03/09 16:31:32.123 30/03/09 12:31:32.123000", "1 15/04/10 14:25:46.789 15/04/10 10:25:46.789000", "2 20/12/11 12:34:56.000 20/12/11 07:34:56.000000"]}} -{"task_id": "f_926", "prompt": "import pandas as pd\nfrom scipy.stats import pearsonr\n\n\ndef f_926(data):\n \"\"\"\n Calculates the Pearson correlation coefficient between numerical scores and categorical grades.\n\n This function performs three main tasks:\n 1. Converts scores from string format to floats.\n 2. Encodes categorical grades into numerical values based on their rank order.\n 3. 
Computes the Pearson correlation coefficient between the numerical scores and the encoded grades.\n\n Parameters:\n - data (dict): A dictionary containing two keys:\n - 'Score_String': A list of scores in string format.\n - 'Grade': A list of corresponding grades in string format.\n Each list under these keys must have the same length.\n\n Returns:\n - correlation (float): The Pearson correlation coefficient between the converted numerical scores and encoded grades.\n Returns NaN if the input data frame has less than 2 rows, as the correlation coefficient cannot be calculated in this case.\n\n Requirements:\n - pandas\n - scipy\n\n Example:\n >>> f_926({'Score_String': ['80.5', '85.7', '90.2'], 'Grade': ['B', 'B+', 'A-']})\n -0.46351538587606683\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n if len(df) < 2: # Check if the data frame has less than 2 rows\n return float(\"nan\") # or return None\n\n df[\"Score_Float\"] = df[\"Score_String\"].astype(float)\n df[\"Grade_Encoded\"] = df[\"Grade\"].astype(\"category\").cat.codes\n correlation = pearsonr(df[\"Score_Float\"], df[\"Grade_Encoded\"])[0]\n return correlation", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_926\"\"\"\n def test_normal_operation(self):\n \"\"\"\n Test normal operation with valid input.\n \"\"\"\n data = {\"Score_String\": [\"80.5\", \"85.7\", \"90.2\"], \"Grade\": [\"B\", \"B+\", \"A-\"]}\n result = f_926(data)\n self.assertIsInstance(result, float)\n def test_empty_input(self):\n \"\"\"\n Test the function with empty input.\n \"\"\"\n data = {\"Score_String\": [], \"Grade\": []}\n result = f_926(data)\n self.assertTrue(pd.isna(result))\n def test_invalid_score_format(self):\n \"\"\"\n Test the function with invalid score format.\n \"\"\"\n data = {\"Score_String\": [\"eighty\", \"85.7\", \"90.2\"], \"Grade\": [\"B\", \"B+\", \"A-\"]}\n with self.assertRaises(ValueError):\n f_926(data)\n def 
test_mismatched_lengths(self):\n \"\"\"\n Test the function with mismatched lengths of scores and grades.\n \"\"\"\n data = {\"Score_String\": [\"80.5\", \"85.7\"], \"Grade\": [\"B\", \"B+\", \"A-\"]}\n with self.assertRaises(ValueError):\n f_926(data)\n def test_non_ordinal_grades(self):\n \"\"\"\n Test the function with non-ordinal grade inputs.\n \"\"\"\n data = {\n \"Score_String\": [\"80.5\", \"85.7\", \"90.2\"],\n \"Grade\": [\"Pass\", \"Fail\", \"Pass\"],\n }\n result = f_926(data)\n self.assertIsInstance(result, float)", "apis": ["scipy.stats.pearsonr", "pandas.DataFrame"], "libs": ["pandas", "scipy"], "doc": {"description": ["Calculates the Pearson correlation coefficient between numerical scores and categorical grades.", "This function performs three main tasks:", "1. Converts scores from string format to floats.", "2. Encodes categorical grades into numerical values based on their rank order.", "3. Computes the Pearson correlation coefficient between the numerical scores and the encoded grades."], "note": [], "params": ["data (dict): A dictionary containing two keys:", "'Score_String': A list of scores in string format.", "'Grade': A list of corresponding grades in string format.", "Each list under these keys must have the same length."], "returns": ["correlation (float): The Pearson correlation coefficient between the converted numerical scores and encoded grades.", "Returns NaN if the input data frame has less than 2 rows, as the correlation coefficient cannot be calculated in this case."], "reqs": ["pandas", "scipy"], "raises": [], "example": [">>> f_926({'Score_String': ['80.5', '85.7', '90.2'], 'Grade': ['B', 'B+', 'A-']})", "-0.46351538587606683"]}} -{"task_id": "f_797", "prompt": "import random\nimport re\n\n\ndef f_797(target_words, n_sentences, vocabulary):\n \"\"\"\n Generate sentences with spaces in certain target words replaced by underscores.\n\n Parameters:\n - target_words (list of str): List of words/phrases where spaces should be 
replaced with underscores.\n - n_sentences (int): Number of sentences to generate. Must not be negative.\n - vocabulary (list of str): List of words to use for generating sentences. Must not be empty.\n\n Returns:\n - list of str: A list of generated sentences in all lowercase, with specified words/phrases underscored.\n\n Raises:\n - ValueError: If n_sentences is negative or if the vocabulary is empty.\n\n Requirements:\n - random\n - re\n\n Notes:\n - Each sentence is generated by randomly sampling 10 words with replacement from a vocabulary,\n then concatenating with a single whitespace. Then, if any words from the target_words list\n appear in these sentences, spaces within those words are replaced with underscores; here the\n modification is insensitive to the case of the letters.\n - The function returns the processed sentences as a list of all lowercase strings.\n\n Examples:\n >>> random.seed(42)\n >>> f_797(['apple banana'], 1, ['apple', 'banana', 'cherry'])\n ['banana apple apple apple cherry cherry cherry apple_banana apple']\n >>> f_797(['Alice Charlie', 'ALICE BOB', 'aLiCe dAn'], 1, ['alice', 'bob', 'charlie', 'dan'])\n ['alice_charlie alice alice_charlie charlie alice_charlie dan alice']\n \"\"\"", "canonical_solution": " if n_sentences < 0:\n raise ValueError(\"n_sentences cannot be negative.\")\n if not vocabulary:\n raise ValueError(\"Vocabulary cannot be empty.\")\n\n sentences = []\n for _ in range(n_sentences):\n sentence = \" \".join(random.choices(vocabulary, k=10))\n for word in target_words:\n pattern = re.compile(re.escape(word), re.IGNORECASE)\n sentence = pattern.sub(word.replace(\" \", \"_\"), sentence)\n sentences.append(sentence.lower())\n return sentences", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.vocabulary = [\n \"apple\",\n \"banana\",\n \"cherry\",\n \"date\",\n \"elderberry\",\n \"fig\",\n \"grape\",\n \"honeydew\",\n ]\n random.seed(42)\n def test_case_1(self):\n # 
Test with multiple target words and sentences\n target_words = [\"apple banana\", \"banana cherry\"]\n n_sentences = 1000\n results = f_797(target_words, n_sentences, [\"apple\", \"banana\", \"cherry\"])\n self.assertEqual(len(results), n_sentences)\n for target in target_words:\n underscored_target = target.replace(\" \", \"_\")\n self.assertTrue(\n any(underscored_target in sentence for sentence in results),\n f\"{underscored_target} not found in any sentences\",\n )\n def test_case_2(self):\n # Test with a single target word in multiple occurrences\n target_words = [\"apple\"]\n n_sentences = 1\n results = f_797(target_words, n_sentences, [\"apple\"] * 10)\n self.assertEqual(len(results), n_sentences)\n self.assertTrue(\n results[0].count(\"apple\") > 1,\n \"Multiple 'apple' occurrences not replaced correctly\",\n )\n def test_case_3(self):\n # Test with no target words\n target_words = []\n n_sentences = 1\n results = f_797(target_words, n_sentences, self.vocabulary)\n self.assertEqual(len(results), n_sentences)\n self.assertTrue(all(\" \" in sentence for sentence in results), \"\")\n def test_case_4(self):\n # Test case sensitivity\n target_words = [\"Apple Banana\"]\n n_sentences = 2\n results = f_797(target_words, n_sentences, self.vocabulary + [\"apple banana\"])\n self.assertEqual(len(results), n_sentences)\n for result in results:\n self.assertIn(\n \"apple_banana\", result, \"Case sensitivity not handled properly\"\n )\n def test_case_5(self):\n # Test generating zero sentences\n target_words = [\"apple\"]\n n_sentences = 0\n results = f_797(target_words, n_sentences, self.vocabulary)\n self.assertEqual(len(results), n_sentences, \"No sentences should be generated\")\n def test_case_6(self):\n # Test function handling invalid inputs - vocabulary\n target_words = [\"apple\"]\n n_sentences = 1\n with self.assertRaises(ValueError):\n f_797(target_words, n_sentences, [])\n def test_case_7(self):\n # Test function handling invalid inputs - n_sentences\n 
target_words = [\"apple\"]\n with self.assertRaises(ValueError):\n f_797(target_words, -1, self.vocabulary)\n with self.assertRaises(TypeError):\n f_797(target_words, 1.0, self.vocabulary)\n def test_case_8(self):\n # Test whitespace target word\n target_words = [\" \"]\n n_sentences = 1\n results = f_797(target_words, n_sentences, [\"apple banana\", \"cherry\"])\n assert len(results[0].split(\"_\")) >= 10\n def test_case_9(self):\n # Test target word not in vocabulary\n target_words = [\"mango\"]\n n_sentences = 2\n results = f_797(target_words, n_sentences, [\"apple\", \"banana\", \"cherry\"])\n for sentence in results:\n self.assertNotIn(\n \"mango\",\n sentence,\n \"Target word not in vocabulary should not appear in sentences.\",\n )", "apis": ["re.escape", "re.IGNORECASE", "re.compile", "random.choices"], "libs": ["re", "random"], "doc": {"description": ["Generate sentences with spaces in certain target words replaced by underscores.", "Notes:", "- Each sentence is generated by randomly sampling 10 words with replacement from a vocabulary,", "then concatenating with a single whitespace. Then, if any words from the target_words list", "appear in these sentences, spaces within those words are replaced with underscores; here the", "modification is insensitive to the case of the letters.", "- The function returns the processed sentences as a list of all lowercase strings."], "note": [], "params": ["target_words (list of str): List of words/phrases where spaces should be replaced with underscores.", "n_sentences (int): Number of sentences to generate. Must not be negative.", "vocabulary (list of str): List of words to use for generating sentences. 
Must not be empty."], "returns": ["list of str: A list of generated sentences in all lowercase, with specified words/phrases underscored."], "reqs": ["random", "re"], "raises": ["ValueError: If n_sentences is negative or if the vocabulary is empty."], "example": ["Examples:", ">>> random.seed(42)", ">>> f_797(['apple banana'], 1, ['apple', 'banana', 'cherry'])", "['banana apple apple apple cherry cherry cherry apple_banana apple']", ">>> f_797(['Alice Charlie', 'ALICE BOB', 'aLiCe dAn'], 1, ['alice', 'bob', 'charlie', 'dan'])", "['alice_charlie alice alice_charlie charlie alice_charlie dan alice']"]}} -{"task_id": "f_814", "prompt": "import os\nfrom pathlib import Path\nfrom datetime import datetime, timezone\n\n\ndef f_814(directory_path: str):\n \"\"\"\n Analyzes a given directory, listing each file it contains along with its size,\n creation time, and last modification time without recursing into subdirectories.\n\n Args:\n - directory_path (str): The path to the directory to be analyzed.\n If it is empty, this function returns an empty list.\n\n Returns:\n - list of tuples: Each tuple contains (file name, file size in bytes,\n creation time in ISO format, modification time in ISO format).\n\n Raises:\n - ValueError: If the provided directory does not exist.\n\n Requirements:\n - os\n - pathlib\n - datetime\n\n Notes:\n - The function assumes the directory exists and contains only files (no\n subdirectories are processed).\n - Times are reported in system time, UTC.\n - The creation and modification times are platform dependent; on some systems,\n the creation time might not be available and might be replaced by the last\n metadata change time.\n\n Examples:\n >>> result = f_814('/path/to/directory')\n >>> print(result)\n [('example.txt', 1024, '2023-04-01T14:30:00Z', '2023-04-02T15:00:00Z'), ...]\n\n >>> result = f_814('/path/to/empty_directory')\n >>> print(result)\n []\n \"\"\"", "canonical_solution": " if not Path(directory_path).is_dir():\n raise 
ValueError(f\"The path {directory_path} is not a valid directory.\")\n\n file_details = []\n for entry in os.scandir(directory_path):\n if entry.is_file():\n file_info = os.stat(entry.path)\n file_size = file_info.st_size\n creation_time = datetime.fromtimestamp(\n file_info.st_ctime, timezone.utc\n ).isoformat()\n modification_time = datetime.fromtimestamp(\n file_info.st_mtime, timezone.utc\n ).isoformat()\n file_details.append(\n (entry.name, file_size, creation_time, modification_time)\n )\n\n return file_details", "test": "import unittest\nimport tempfile\nimport os\nfrom datetime import datetime, timezone, timedelta\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Set up a 'before' time with leeway for testing file modification times\n self.before_creation = datetime.now(timezone.utc) - timedelta(seconds=1)\n # Setup a temporary directory\n self.test_dir = tempfile.TemporaryDirectory()\n # Create test files\n self.files = {\n \"empty.txt\": 0,\n \"small.txt\": 5,\n \"medium.txt\": 50,\n \"large.txt\": 500,\n \"utc_test.txt\": 10,\n }\n for file_name, size in self.files.items():\n path = os.path.join(self.test_dir.name, file_name)\n with open(path, \"wb\") as f:\n f.write(os.urandom(size))\n def tearDown(self):\n # Cleanup the directory after tests\n self.test_dir.cleanup()\n def test_case_1(self):\n # Test the function on an existing directory.\n result = f_814(self.test_dir.name)\n self.assertEqual(len(result), len(self.files))\n def test_case_2(self):\n # Test the function with a non-existing directory.\n with self.assertRaises(ValueError):\n f_814(\"/path/to/non/existing/directory\")\n def test_case_3(self):\n # Test the function with an empty directory.\n with tempfile.TemporaryDirectory() as empty_dir:\n result = f_814(empty_dir)\n self.assertEqual(len(result), 0)\n def test_case_4(self):\n # Test if the function correctly identifies file sizes.\n result = f_814(self.test_dir.name)\n sizes = {file[0]: file[1] for file in result}\n for 
file_name, size in self.files.items():\n self.assertEqual(sizes[file_name], size)\n def test_case_5(self):\n # Test if the function lists all expected files, regardless of order.\n result = f_814(self.test_dir.name)\n file_names = sorted([file[0] for file in result])\n expected_file_names = sorted(\n list(self.files.keys())\n ) # Assuming 'utc_test.txt' is expected.\n self.assertListEqual(file_names, expected_file_names)\n def test_case_6(self):\n # Test if modification times are correctly identified.\n result = f_814(self.test_dir.name)\n # Check if modification times are reasonable (not testing specific times because of system differences)\n for _, _, creation_time, modification_time in result:\n creation_datetime = datetime.fromisoformat(creation_time)\n modification_datetime = datetime.fromisoformat(modification_time)\n self.assertTrue(creation_datetime <= modification_datetime)\n def test_case_7(self):\n # Test that the function ignores directories.\n sub_dir_path = os.path.join(self.test_dir.name, \"subdir\")\n os.mkdir(sub_dir_path)\n # Add a file inside the sub-directory to ensure it's not empty\n with open(os.path.join(sub_dir_path, \"file.txt\"), \"w\") as sub_file:\n sub_file.write(\"This is a test.\")\n result = f_814(self.test_dir.name)\n self.assertEqual(\n len(result), len(self.files)\n ) # Should not count the subdir or its contents\n def test_case_8(self):\n # Test if file names are correctly identified.\n result = f_814(self.test_dir.name)\n names = [file[0] for file in result]\n for name in self.files.keys():\n self.assertIn(name, names)\n def test_case_9(self):\n # Test that a non-directory path raises a ValueError.\n with tempfile.NamedTemporaryFile() as tmpfile:\n with self.assertRaises(ValueError):\n f_814(tmpfile.name)\n def test_case_10(self):\n # Test timestamps are in UTC and within a reasonable accuracy window.\n self.after_creation = datetime.now(timezone.utc)\n result = f_814(self.test_dir.name)\n for _, _, creation_time, 
modification_time in result:\n creation_dt = datetime.fromisoformat(creation_time)\n modification_dt = datetime.fromisoformat(modification_time)\n # Ensure the timestamps are in UTC\n self.assertEqual(creation_dt.tzinfo, timezone.utc)\n self.assertEqual(modification_dt.tzinfo, timezone.utc)\n # Ensure timestamps are within a reasonable window\n self.assertTrue(self.before_creation <= creation_dt <= self.after_creation)\n self.assertTrue(\n self.before_creation <= modification_dt <= self.after_creation\n )", "apis": ["os.scandir", "datetime.datetime.fromtimestamp", "datetime.timezone.utc", "pathlib.Path", "os.stat"], "libs": ["datetime", "os", "pathlib"], "doc": {"description": ["Analyzes a given directory, listing each file it contains along with its size,", "creation time, and last modification time without recursing into subdirectories.", "Args:", "- directory_path (str): The path to the directory to be analyzed.", "If it is empty, this function returns an empty list.", "Notes:", "- The function assumes the directory exists and contains only files (no", "subdirectories are processed).", "- Times are reported in system time, UTC.", "- The creation and modification times are platform dependent; on some systems,", "the creation time might not be available and might be replaced by the last", "metadata change time.", ">>> result = f_814('/path/to/empty_directory')", ">>> print(result)", "[]"], "note": [], "params": [], "returns": ["list of tuples: Each tuple contains (file name, file size in bytes,", "creation time in ISO format, modification time in ISO format)."], "reqs": ["os", "pathlib", "datetime"], "raises": ["ValueError: If the provided directory does not exist."], "example": ["Examples:", ">>> result = f_814('/path/to/directory')", ">>> print(result)", "[('example.txt', 1024, '2023-04-01T14:30:00Z', '2023-04-02T15:00:00Z'), ...]"]}} -{"task_id": "f_835", "prompt": "import sys\nimport sqlite3\n\n# Constants\nPATH_TO_APPEND = \"path/to/whatever\"\nDATABASE = 
\"path/to/database.db\"\n\n\ndef f_835(path_to_append=PATH_TO_APPEND, database=DATABASE):\n \"\"\"\n This function appends a given path to sys.path and updates an SQLite database with the path, \n creating the table if needed and avoiding duplicates.\n\n Parameters:\n - path_to_append (str): A file system path to be appended to sys.path and inserted\n into the SQLite database. Defaults to 'path/to/whatever' if not specified.\n - database (str): The file system path to the SQLite database file. Defaults to\n 'path/to/database.db' if not provided. The function interacts with this database\n to store the path.\n\n Returns:\n - str: The path that was appended to sys.path and inserted into the database.\n\n Requirements:\n - sys\n - sqlite3\n\n\n Examples:\n >>> f_835('path/to/new_directory', 'path/to/new_database.db')\n 'path/to/new_directory'\n >>> f_835()\n 'path/to/whatever'\n \"\"\"", "canonical_solution": " sys.path.append(path_to_append)\n\n conn = sqlite3.connect(database)\n cur = conn.cursor()\n cur.execute(\"CREATE TABLE IF NOT EXISTS paths (path TEXT UNIQUE)\")\n cur.execute(\"INSERT OR IGNORE INTO paths (path) VALUES (?)\", (path_to_append,))\n conn.commit()\n conn.close()\n\n return path_to_append", "test": "import unittest\nimport sqlite3\nimport os\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_835\"\"\"\n def setUp(self):\n path_to_create = os.path.dirname(PATH_TO_APPEND)\n os.makedirs(path_to_create, exist_ok=True)\n self.test_db = DATABASE\n def test_basic_path_insertion(self):\n \"\"\"Test the function when a path is provided.\"\"\"\n test_path = \"path/to/test/path\"\n result = f_835(test_path, self.test_db)\n self.assertEqual(result, test_path)\n # Check the database to ensure the path was saved\n conn = sqlite3.connect(self.test_db)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM paths WHERE path=?\", (test_path,))\n fetched_path = cur.fetchone()\n conn.close()\n self.assertIsNotNone(fetched_path)\n 
self.assertEqual(fetched_path[0], test_path)\n def test_existing_path(self):\n \"\"\"Test the function when an existing path is provided.\"\"\"\n # Insert an existing path\n existing_path = \"existing/path\"\n f_835(existing_path, self.test_db)\n # Attempt to insert the same path again\n result = f_835(existing_path, self.test_db)\n self.assertEqual(result, existing_path)\n # Check the database to ensure there's only one entry for the existing path\n conn = sqlite3.connect(self.test_db)\n cur = conn.cursor()\n cur.execute(\"SELECT COUNT(*) FROM paths WHERE path=?\", (existing_path,))\n count = cur.fetchone()[0]\n conn.close()\n self.assertEqual(count, 1)\n def test_multiple_paths(self):\n \"\"\"Test the function when multiple paths are provided.\"\"\"\n paths = [\"path1\", \"path2\", \"path3\"]\n for path in paths:\n result = f_835(path, self.test_db)\n self.assertEqual(result, path)\n # Check the database to ensure all paths are saved\n conn = sqlite3.connect(self.test_db)\n cur = conn.cursor()\n cur.execute(\"SELECT COUNT(*) FROM paths\")\n count = cur.fetchone()[0]\n conn.close()\n self.assertEqual(count, len(paths))\n def test_database_creation(self):\n \"\"\"Test the function when the database doesn't exist.\"\"\"\n new_db = \"path/to/new_test_database.db\"\n test_path = \"path/to/new\"\n os.makedirs(os.path.dirname(test_path), exist_ok=True)\n result = f_835(test_path, new_db)\n self.assertEqual(result, test_path)\n # Check the new database to ensure the path was saved\n conn = sqlite3.connect(new_db)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM paths WHERE path=?\", (test_path,))\n fetched_path = cur.fetchone()\n conn.close()\n self.assertIsNotNone(fetched_path)\n self.assertEqual(fetched_path[0], test_path)\n def test_invalid_database(self):\n \"\"\"Test the function when an invalid database is provided.\"\"\"\n invalid_db = \"invalid/path/database.db\"\n test_path = \"test/path\"\n with self.assertRaises(sqlite3.OperationalError):\n f_835(test_path, 
invalid_db)\n def tearDown(self):\n # Cleanup the test databases\n dbs_to_remove = [\"path/to/database.db\", \"path/to/new_test_database.db\"]\n for db in dbs_to_remove:\n if os.path.exists(db):\n os.remove(db)\n # Cleanup the test directories\n dirs_to_remove = [\"path/to/whatever\", \"path/to\", \"path\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["sys.path.append", "sys.path", "sqlite3.connect"], "libs": ["sqlite3", "sys"], "doc": {"description": ["This function appends a given path to sys.path and updates an SQLite database with the path,", "creating the table if needed and avoiding duplicates."], "note": [], "params": ["path_to_append (str): A file system path to be appended to sys.path and inserted", "into the SQLite database. Defaults to 'path/to/whatever' if not specified.", "database (str): The file system path to the SQLite database file. Defaults to", "'path/to/database.db' if not provided. The function interacts with this database", "to store the path."], "returns": ["str: The path that was appended to sys.path and inserted into the database."], "reqs": ["sys", "sqlite3"], "raises": [], "example": ["Examples:", ">>> f_835('path/to/new_directory', 'path/to/new_database.db')", "'path/to/new_directory'", ">>> f_835()", "'path/to/whatever'"]}} +{"task_id": "f_424", "prompt": "import sqlite3\nimport pandas as pd\n\n\ndef f_424(db_name, table_name):\n \"\"\"\n Plot the relationship between the first and second numerical columns of an SQLite3 table.\n\n Parameters:\n - db_name (str): The absolute path to the SQLite3 database.\n - table_name (str): The name of the table to plot from.\n\n Returns:\n - matplotlib.axes._axes.Axes: Scatterplot with column name labeled on their respective axes.\n\n Requirements:\n - sqlite3\n - pandas\n\n Example:\n >>> ax = f_424('/path/to/database/test.db', 'People')\n >>> type(ax)\n <class 'matplotlib.axes._axes.Axes'>\n >>> ax.get_xticklabels()\n [Text(0.9400000000000001, 0, '0.94'), ...
]\n \"\"\"", "canonical_solution": " # Connect to the SQLite database\n conn = sqlite3.connect(db_name)\n\n # Dynamically get the first two numerical columns from the table (excluding 'id')\n df = pd.read_sql_query(f\"SELECT * from {table_name}\", conn)\n numerical_columns = df.select_dtypes(include=[\"float64\", \"int64\"]).columns.tolist()\n if \"id\" in numerical_columns:\n numerical_columns.remove(\"id\")\n if len(numerical_columns) < 2:\n raise ValueError(\"The table must have at least two numerical columns to plot.\")\n\n # Plot the relationship between the two columns\n ax = df.plot.scatter(x=numerical_columns[0], y=numerical_columns[1])\n return ax", "test": "import unittest\nimport sqlite3\nimport os\nimport matplotlib.pyplot as plt\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.test_db_path = os.path.join(self.temp_dir.name, \"test.db\")\n self.another_test_db_path = os.path.join(self.temp_dir.name, \"another_test.db\")\n self.nonexistent_db_path = os.path.join(self.temp_dir.name, \"nonexistent.db\")\n # Setup for 'test.db'\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE People (id INTEGER PRIMARY KEY, name TEXT, age INTEGER, height REAL)\"\n )\n self.data = [\n (\"Alice\", 25, 5.5),\n (\"Bob\", 30, 6.0),\n (\"Charlie\", 35, 5.8),\n (\"David\", 40, 6.2),\n (\"Eve\", 45, 5.9),\n (\"Frank\", 50, 5.6),\n ]\n cur.executemany(\n \"INSERT INTO People (name, age, height) VALUES (?, ?, ?)\", self.data\n )\n # Setup for 'another_test.db'\n with sqlite3.connect(self.another_test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE Animals (id INTEGER PRIMARY KEY, name TEXT, lifespan INTEGER, weight REAL)\"\n )\n animal_data = [\n (\"Dog\", 13, 30.0),\n (\"Cat\", 15, 4.5),\n (\"Elephant\", 70, 6000.0),\n (\"Dolphin\", 20, 150.0),\n ]\n cur.executemany(\n \"INSERT INTO Animals (name, lifespan, weight) VALUES (?, 
?, ?)\",\n animal_data,\n )\n def tearDown(self):\n self.temp_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n # Test basic functionality\n ax = f_424(self.test_db_path, \"People\")\n self.assertEqual(ax.get_xlabel(), \"age\")\n self.assertEqual(ax.get_ylabel(), \"height\")\n self.assertEqual(len(ax.collections[0].get_offsets()), 6)\n def test_case_2(self):\n # Test handling non-existent table\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"NonExistentTable\")\n def test_case_3(self):\n # Test handling non-existent db\n with self.assertRaises(Exception):\n f_424(self.nonexistent_db_path, \"People\")\n def test_case_4(self):\n # Table with removed numerical column should raise error\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n f\"CREATE TABLE temp AS SELECT id, name, age FROM People WHERE name IN ('Alice', 'Bob')\"\n )\n cur.execute(f\"DROP TABLE People\")\n cur.execute(f\"ALTER TABLE temp RENAME TO People\")\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"People\")\n # Revert changes\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(f\"CREATE TABLE temp AS SELECT * FROM People\")\n cur.execute(f\"DROP TABLE People\")\n cur.execute(\n f\"CREATE TABLE People (id INTEGER PRIMARY KEY, name TEXT, age INTEGER, height REAL)\"\n )\n cur.executemany(\n f\"INSERT INTO People (name, age, height) VALUES (?, ?, ?)\", self.data\n )\n def test_case_5(self):\n # Test another set of data/db\n ax = f_424(self.another_test_db_path, \"Animals\")\n self.assertEqual(ax.get_xlabel(), \"lifespan\")\n self.assertEqual(ax.get_ylabel(), \"weight\")\n self.assertEqual(len(ax.collections[0].get_offsets()), 4)\n def test_case_6(self):\n # Test handling of a table with only one numerical column\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE SingleNumCol (id INTEGER PRIMARY KEY, name TEXT, age INTEGER)\"\n )\n 
with self.assertRaises(Exception):\n f_424(self.test_db_path, \"SingleNumCol\")\n def test_case_7(self):\n # Test handling of a table with no numerical columns\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\n \"CREATE TABLE NoNumCols (id INTEGER PRIMARY KEY, name TEXT, description TEXT)\"\n )\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"NoNumCols\")\n def test_case_8(self):\n # Test a table where 'id' is the only numerical column\n with sqlite3.connect(self.test_db_path) as conn:\n cur = conn.cursor()\n cur.execute(\"CREATE TABLE OnlyIDNum (id INTEGER PRIMARY KEY, name TEXT)\")\n with self.assertRaises(Exception):\n f_424(self.test_db_path, \"OnlyIDNum\")\n def test_case_9(self):\n # Test plotting when the first two numerical columns are not 'id', 'age', or 'height'\n with sqlite3.connect(self.another_test_db_path) as conn:\n cur = conn.cursor()\n custom_data = [(\"Lion\", 15, 190.5), (\"Tiger\", 20, 220.0)]\n cur.executemany(\n \"INSERT INTO Animals (name, lifespan, weight) VALUES (?, ?, ?)\",\n custom_data,\n )\n ax = f_424(self.another_test_db_path, \"Animals\")\n self.assertEqual(ax.get_xlabel(), \"lifespan\")\n self.assertEqual(ax.get_ylabel(), \"weight\")\n self.assertGreaterEqual(len(ax.collections[0].get_offsets()), 2)", "apis": ["sqlite3.connect", "pandas.read_sql_query"], "libs": ["pandas", "sqlite3"], "doc": {"description": ["Plot the relationship between the first and second numerical columns of an SQLite3 table."], "note": [], "params": ["db_name (str): The absolute path to the SQLite3 database.", "table_name (str): The name of the table to plot from."], "returns": ["matplotlib.axes._axes.Axes: Scatterplot with column name labeled on their respective axes."], "reqs": ["sqlite3", "pandas"], "raises": [], "example": [">>> ax = f_424('/path/to/database/test.db', 'People')", ">>> type(ax)", "<class 'matplotlib.axes._axes.Axes'>", ">>> ax.get_xticklabels()", "[Text(0.9400000000000001, 0, '0.94'), ...
]"]}} +{"task_id": "f_845", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\ndef f_845(data, column_name=\"target_column\"):\n \"\"\"\n Converts a given JSON data into a Pandas DataFrame and plots a histogram of a specified column.\n The function handles non-numeric columns by converting them to categorical type and then to numeric codes. \n It also checks if the specified column exists in the DataFrame.\n\n - The histogram's title is set to 'Histogram of '.\n - The histogram's x-label are set to the name of the specified column.\n \n Parameters:\n - data (list of dict)\n - column_name (str, optional)\n\n Returns:\n - DataFrame: A pandas DataFrame created from the input JSON data.\n - Axes: A matplotlib Axes object showing the histogram plot of the specified column.\n\n Exceptions:\n - ValueError: Raised if the specified column name does not exist in the DataFrame.\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> sample_data = [{'userId': 1, 'value': 10}, {'userId': 2, 'value': 15}]\n >>> df, ax = f_845(sample_data, 'userId')\n >>> print(df)\n userId value\n 0 1 10\n 1 2 15\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n\n if column_name not in df.columns:\n raise ValueError(f\"Column '{column_name}' not found in the DataFrame.\")\n\n if not pd.api.types.is_numeric_dtype(df[column_name]):\n df[column_name] = df[column_name].astype(\"category\").cat.codes\n\n _, ax = plt.subplots()\n df[column_name].hist(ax=ax)\n ax.set_title(f\"Histogram of {column_name}\")\n ax.set_xlabel(column_name)\n return df, ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_845 function.\"\"\"\n def setUp(self):\n # Sample data for testing\n self.sample_data = [\n {\"userId\": 1, \"id\": 1, \"title\": \"A\", \"completed\": False},\n {\"userId\": 1, \"id\": 2, \"title\": \"B\", \"completed\": True},\n {\"userId\": 2, \"id\": 3, \"title\": \"A\", \"completed\": False},\n {\"userId\": 
2, \"id\": 4, \"title\": \"B\", \"completed\": True},\n {\"userId\": 3, \"id\": 5, \"title\": \"A\", \"completed\": False},\n {\"userId\": 3, \"id\": 6, \"title\": \"B\", \"completed\": True},\n {\"userId\": 3, \"id\": 7, \"title\": \"B\", \"completed\": True},\n ]\n def test_normal_case(self):\n \"\"\"Test if the function returns correct DataFrame and histogram for a valid column.\"\"\"\n df, ax = f_845(self.sample_data, \"userId\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(len(df), len(self.sample_data))\n self.assertEqual(ax.get_title(), \"Histogram of userId\")\n self.assertEqual(ax.get_xlabel(), \"userId\")\n def test_non_existent_column(self):\n \"\"\"Test if the function raises an error for a non-existent column.\"\"\"\n with self.assertRaises(ValueError):\n f_845(self.sample_data, \"non_existent_column\")\n def test_empty_data(self):\n \"\"\"Test the function with empty data.\"\"\"\n with self.assertRaises(ValueError):\n f_845([], \"userId\")\n def test_non_numeric_data(self):\n \"\"\"Test the function with a non-numeric column.\"\"\"\n df, ax = f_845(self.sample_data, \"title\")\n self.assertTrue(pd.api.types.is_numeric_dtype(df[\"title\"]))\n self.assertEqual(ax.get_title(), \"Histogram of title\")\n self.assertEqual(ax.get_xlabel(), \"title\")\n def test_duplicate_values(self):\n \"\"\"Test the function with a column that has duplicate values.\"\"\"\n df, ax = f_845(self.sample_data, \"title\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(ax.get_title(), \"Histogram of title\")\n self.assertEqual(ax.get_xlabel(), \"title\")\n def tearDown(self):\n plt.clf()", "apis": ["pandas.DataFrame", "pandas.api.types.is_numeric_dtype", "matplotlib.pyplot.subplots", "pandas.api"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Converts a given JSON data into a Pandas DataFrame and plots a histogram of a specified column.", "The function handles non-numeric columns by converting them to categorical type and 
then to numeric codes.", "It also checks if the specified column exists in the DataFrame.", "- The histogram's title is set to 'Histogram of '.", "- The histogram's x-label are set to the name of the specified column.", "Exceptions:", "- ValueError: Raised if the specified column name does not exist in the DataFrame."], "note": [], "params": ["data (list of dict)", "column_name (str, optional)"], "returns": ["DataFrame: A pandas DataFrame created from the input JSON data.", "Axes: A matplotlib Axes object showing the histogram plot of the specified column."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> sample_data = [{'userId': 1, 'value': 10}, {'userId': 2, 'value': 15}]", ">>> df, ax = f_845(sample_data, 'userId')", ">>> print(df)", "userId value", "0 1 10", "1 2 15"]}} +{"task_id": "f_808", "prompt": "import os\nimport re\nimport shutil\n\n\ndef f_808(source_directory: str, target_directory: str, pattern: str = r\"\\d{4}\") -> int:\n \"\"\"\n Moves files matching a specific regex pattern from a source directory to a target directory.\n\n Parameters:\n - source_directory (str): Path of the source directory from which files will be moved.\n - target_directory (str): Path of the target directory to which files will be moved.\n - pattern (str): Regex pattern to match filenames.\n Defaults to r'\\\\d{4}' to match filenames containing four contiguous digits.\n\n Returns:\n - int: The number of files successfully moved.\n\n Requirements:\n - os\n - re\n - shutil\n\n Note:\n - If source_directory does not exist or is not a directory, this function returns 0.\n - If target_directory does not exist, this function will create it.\n\n Examples:\n >>> os.listdir('/path/to/source')\n ['1000.txt', '1001.txt', '1002.txt', 'not_a_match.txt']\n >>> f_808('/path/to/source', '/path/to/target')\n 3\n \"\"\"", "canonical_solution": " moved_files_count = 0\n\n if not os.path.exists(source_directory) or not os.path.isdir(source_directory):\n return 0\n\n if not 
os.path.exists(target_directory):\n os.makedirs(target_directory)\n\n for root, _, files in os.walk(source_directory):\n for file in files:\n if re.search(pattern, file):\n shutil.move(\n os.path.join(root, file), os.path.join(target_directory, file)\n )\n moved_files_count += 1\n\n return moved_files_count", "test": "import unittest\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n def create_test_files(self, directory, file_names):\n # Helper to create files for testing\n for file_name in file_names:\n with open(os.path.join(directory, file_name), \"a\") as file:\n file.write(\"test content\")\n def test_files_moved(self):\n # Test basic case with default pattern\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n self.create_test_files(\n src,\n [\n \"1234.txt\",\n \"test5678.txt\",\n \"nope.txt\",\n \"another1234.txt\",\n \"4321done.txt\",\n ],\n )\n result = f_808(src, dst)\n self.assertEqual(\n result, 4, \"Should move 4 files matching the default pattern.\"\n )\n for file_name in [\n \"1234.txt\",\n \"another1234.txt\",\n \"4321done.txt\",\n \"test5678.txt\",\n ]:\n self.assertTrue(\n os.path.exists(os.path.join(dst, file_name)),\n f\"{file_name} should be in the target directory\",\n )\n def test_files_moved_with_custom_pattern(self):\n # Test case with custom pattern\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n self.create_test_files(\n src,\n [\n \"1234.txt\",\n \"test5678.txt\",\n \"nope.txt\",\n \"another1234.txt\",\n \"4321done.txt\",\n ],\n )\n result = f_808(src, dst, r\"test\\w+\")\n self.assertEqual(\n result, 1, \"Should move 1 file matching the custom pattern 'test\\\\w+.'\"\n )\n def test_no_files_moved_if_no_match(self):\n # Test no match\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n self.create_test_files(src, [\"nope.txt\"])\n result = f_808(src, dst)\n self.assertEqual(result, 0, \"Should move 0 files if no 
match.\")\n def test_return_zero_if_source_does_not_exist(self):\n # Test source_directory if not exists\n with tempfile.TemporaryDirectory() as dst:\n result = f_808(os.path.join(dst, \"non_existing_dir\"), dst)\n self.assertEqual(\n result, 0, \"Should return 0 if source directory does not exist.\"\n )\n def test_target_directory_created_if_not_exist(self):\n # Test that destination directory will be created if it did not exist\n with tempfile.TemporaryDirectory() as src:\n self.create_test_files(src, [\"1234.txt\"])\n new_target = os.path.join(src, \"new_target_dir\")\n f_808(src, new_target)\n self.assertTrue(\n os.path.exists(new_target),\n \"Target directory should be created if it does not exist.\",\n )\n def test_no_files_in_source(self):\n # Test empty source direcotry\n with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as dst:\n result = f_808(src, dst)\n self.assertEqual(\n result, 0, \"Should move 0 files if source directory is empty.\"\n )", "apis": ["os.makedirs", "os.walk", "re.search", "os.path", "os.path.isdir", "os.path.join", "os.path.exists", "shutil.move"], "libs": ["os", "shutil", "re"], "doc": {"description": ["Moves files matching a specific regex pattern from a source directory to a target directory."], "note": ["If source_directory does not exist or is not a directory, this function returns 0.", "If target_directory does not exist, this function will create it."], "params": ["source_directory (str): Path of the source directory from which files will be moved.", "target_directory (str): Path of the target directory to which files will be moved.", "pattern (str): Regex pattern to match filenames.", "Defaults to r'\\\\d{4}' to match filenames containing four contiguous digits."], "returns": ["int: The number of files successfully moved."], "reqs": ["os", "re", "shutil"], "raises": [], "example": ["Examples:", ">>> os.listdir('/path/to/source')", "['1000.txt', '1001.txt', '1002.txt', 'not_a_match.txt']", ">>> 
f_808('/path/to/source', '/path/to/target')", "3"]}} +{"task_id": "f_421", "prompt": "import sqlite3\nimport numpy as np\nfrom random import choice, seed\n\n\ndef f_421(db_path, table_name, num_entries, random_seed=None):\n \"\"\"\n Insert random data into an SQLite3 table that contains random names, ages, and heights.\n If the table does not exist, it will be created.\n This function uses the following constants:\n - NAMES: List of possible names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia'].\n - AGES: Range of possible ages from 18 to 64.\n - HEIGHTS: Range of possible heights from 150cm to 199cm.\n\n Parameters:\n db_path (str): The path to the SQLite3 database file.\n table_name (str): The name of the table to insert data into.\n num_entries (int): The number of entries to insert. Must not be negative.\n random_seed (int, optional): Seed for random number generation. Defaults to None (no fixed seed).\n\n Returns:\n int: The number of rows inserted.\n\n Requirements:\n - sqlite3\n - numpy\n - random.choice\n - random.seed\n\n Example:\n >>> f_421('path_to_test.db', 'People', 100, random_seed=42)\n 100\n \"\"\"", "canonical_solution": " # Setting the random seed if provided\n if random_seed is not None:\n seed(random_seed)\n np.random.seed(random_seed)\n\n if num_entries < 0:\n raise ValueError(\"num_entries cannot be negative.\")\n\n NAMES = [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"]\n AGES = list(range(18, 65))\n HEIGHTS = list(range(150, 200))\n\n conn = sqlite3.connect(db_path)\n cur = conn.cursor()\n\n table_creation_sql = (\n \"CREATE TABLE IF NOT EXISTS {} (name TEXT, age INTEGER, height INTEGER)\".format(\n table_name\n )\n )\n cur.execute(table_creation_sql)\n\n inserted_rows = 0\n for _ in range(num_entries):\n name = choice(NAMES)\n age = choice(AGES)\n height = choice(HEIGHTS)\n insertion_sql = \"INSERT INTO {} VALUES (?, ?, ?)\".format(table_name)\n cur.execute(insertion_sql, (name, age, height))\n inserted_rows += 
cur.rowcount\n\n conn.commit()\n\n return inserted_rows", "test": "import unittest\nimport os\nimport sqlite3\nimport tempfile\nclass TestCases(unittest.TestCase):\n NAMES = [\"John\", \"Jane\", \"Steve\", \"Emma\", \"Liam\", \"Olivia\"]\n AGES = range(18, 65)\n HEIGHTS = range(150, 200)\n def setUp(self):\n # Setup a temporary directory before each test\n self.temp_dir = tempfile.TemporaryDirectory()\n self.db_path = os.path.join(self.temp_dir.name, \"test.db\")\n def tearDown(self):\n # Clean up the temporary directory after each test\n self.temp_dir.cleanup()\n def test_case_1(self):\n # Test inserting 50 entries with a fixed seed\n result = f_421(self.db_path, \"SamplePeople\", 50, random_seed=42)\n self.assertEqual(result, 50)\n def test_case_2(self):\n # Test inserting 30 entries into a new table with a fixed seed\n result = f_421(self.db_path, \"NewPeople\", 30, random_seed=42)\n self.assertEqual(result, 30)\n def test_case_3(self):\n # Test inserting 20 entries, verifying smaller batch works as expected\n result = f_421(self.db_path, \"SamplePeople\", 20, random_seed=42)\n self.assertEqual(result, 20)\n def test_case_4(self):\n # Test inserting a large number of entries (200) with a fixed seed\n result = f_421(self.db_path, \"SamplePeople\", 200, random_seed=42)\n self.assertEqual(result, 200)\n def test_case_5(self):\n # Test inserting 0 entries to check handling of empty input\n result = f_421(self.db_path, \"SamplePeople\", 0, random_seed=42)\n self.assertEqual(result, 0)\n def test_case_6(self):\n # Test the content of the rows for correctness against expected values\n f_421(self.db_path, \"ContentCheck\", 10, random_seed=42)\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM ContentCheck\")\n rows = cur.fetchall()\n for row in rows:\n self.assertIn(row[0], self.NAMES)\n self.assertIn(row[1], self.AGES)\n self.assertIn(row[2], self.HEIGHTS)\n def test_case_7(self):\n # Test invalid db path\n with 
self.assertRaises(sqlite3.OperationalError):\n f_421(\"/invalid/path.db\", \"TestTable\", 10)\n def test_case_8(self):\n # Test invalid table names (SQL keywords)\n with self.assertRaises(sqlite3.OperationalError):\n f_421(self.db_path, \"Select\", 10)\n def test_case_9(self):\n # Test handling invalid num_entries\n with self.assertRaises(Exception):\n f_421(self.db_path, \"TestTable\", -1)\n with self.assertRaises(TypeError):\n f_421(self.db_path, \"TestTable\", \"ten\")\n def test_case_10(self):\n # Test handling invalid random seed\n with self.assertRaises(Exception):\n f_421(self.db_path, \"TestTable\", 10, random_seed=\"invalid\")\n def test_case_11(self):\n # Test different schema in existing table\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(\"CREATE TABLE TestTable (id INTEGER)\")\n conn.close()\n with self.assertRaises(sqlite3.OperationalError):\n f_421(self.db_path, \"TestTable\", 10)\n def test_case_12(self):\n # Insert a known set of data and verify its integrity\n f_421(self.db_path, \"IntegrityCheck\", 1, random_seed=42)\n conn = sqlite3.connect(self.db_path)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM IntegrityCheck\")\n row = cur.fetchone()\n self.assertIsNotNone(row)\n def test_case_13(self):\n # Test against SQL injection in table_name parameter\n malicious_name = \"Test; DROP TABLE IntegrityCheck;\"\n with self.assertRaises(sqlite3.OperationalError):\n f_421(self.db_path, malicious_name, 1)", "apis": ["numpy.random", "random.seed", "random.choice", "numpy.random.seed", "sqlite3.connect"], "libs": ["random", "numpy", "sqlite3"], "doc": {"description": ["Insert random data into an SQLite3 table that contains random names, ages, and heights.", "If the table does not exist, it will be created.", "This function uses the following constants:", "- NAMES: List of possible names ['John', 'Jane', 'Steve', 'Emma', 'Liam', 'Olivia'].", "- AGES: Range of possible ages from 18 to 64.", "- HEIGHTS: Range of possible heights 
from 150cm to 199cm."], "note": [], "params": ["db_path (str): The path to the SQLite3 database file.", "table_name (str): The name of the table to insert data into.", "num_entries (int): The number of entries to insert. Must not be negative.", "random_seed (int, optional): Seed for random number generation. Defaults to None (no fixed seed)."], "returns": ["int: The number of rows inserted."], "reqs": ["sqlite3", "numpy", "random.choice", "random.seed"], "raises": [], "example": [">>> f_421('path_to_test.db', 'People', 100, random_seed=42)", "100"]}} +{"task_id": "f_426", "prompt": "from collections import Counter\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport itertools\n\ndef f_426(list_of_menuitems, title=\"Menu Distribution\", color=\"blue\", width=1.0):\n \"\"\"\n Given a nested list of menu items, flatten the list using itertool chain, count the occurrences of each item, then\n plot a histogram with an alphabetically sorted x-axis labeled as \"Menu Items\" and y-axis as \"Frequency\".\n\n Parameters:\n - list_of_menuitems (list): A non-empty nested list of menu items. Each element is a list of menu item strings.\n - title (str, optional): The title of the histogram plot. Default is \"Menu Distribution\".\n - color (str, optional): The color of the bars in the histogram. Default is \"blue\".\n - width (float, optional): The width of the bars in the histogram. 
Default is 1.0.\n\n Returns:\n - ax (object): An Axes object representing the histogram plot.\n\n Requirements:\n - collections.Counter\n - numpy\n - matplotlib.pyplot\n - itertools\n\n Example:\n >>> f_426([['Pizza', 'Burger'], ['Pizza', 'Coke'], ['Pasta', 'Coke']])\n \n >>> f_426(['Burger'], title='A Title', color='red', width=5.0)\n \n \"\"\"", "canonical_solution": " # Flatten the list\n flat_list = list(itertools.chain(*list_of_menuitems))\n\n # Count the occurrences of each menu item\n counter = Counter(flat_list)\n labels, values = zip(*sorted(counter.items(), key=lambda x: x[0]))\n indexes = np.arange(len(labels))\n\n # Plot the histogram\n fig, ax = plt.subplots()\n ax.bar(indexes, values, width, color=color)\n ax.set_xticklabels(labels)\n ax.set_xlabel(\"Menu Items\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(title)\n\n return ax", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n input_data = [[\"Pizza\", \"Burger\"], [\"Pizza\", \"Coke\"], [\"Pasta\", \"Coke\"]]\n ax = f_426(input_data)\n # Test default plot properties\n self.assertEqual(ax.get_title(), \"Menu Distribution\")\n self.assertEqual(ax.get_xlabel(), \"Menu Items\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n for p in ax.patches:\n # RGBA color\n self.assertEqual(p.get_facecolor(), (0.0, 0.0, 1.0, 1.0))\n # bar width\n self.assertEqual(p.get_width(), 1.0)\n def test_case_2(self):\n input_data = [[\"Pizza\", \"Burger\"], [\"Pizza\", \"Coke\"], [\"Pasta\", \"Coke\"]]\n ax = f_426(input_data, title=\"Custom Title\", color=\"red\", width=0.8)\n # Test custom plot properties\n self.assertEqual(ax.get_title(), \"Custom Title\")\n self.assertEqual(ax.get_xlabel(), \"Menu Items\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n for p in ax.patches:\n # RGBA color\n self.assertEqual(p.get_facecolor(), (1.0, 0.0, 0.0, 1.0))\n # bar width\n self.assertEqual(p.get_width(), 0.8)\n def test_case_3(self):\n input_data = [[\"Burger\"], [\"Pizza\"], 
[\"Pasta\"]]\n ax = f_426(input_data)\n # Test count\n bars = [p.get_height() for p in ax.patches]\n self.assertEqual(bars, [1, 1, 1])\n def test_case_4(self):\n input_data = [[\"Carrot\", \"Apple\"], [\"Apple\", \"Banana\"], [\"Banana\"]]\n ax = f_426(input_data)\n # Test x-axis order\n self.assertEqual(\n [_._text for _ in ax.get_xticklabels() if _._text],\n [\"Apple\", \"Banana\", \"Carrot\"],\n )\n def test_case_5(self):\n # Test input edge case: some empty elements\n ax = f_426([[], [\"Apple\"]])\n self.assertEqual(len(ax.patches), 1)\n for p in ax.patches:\n # bar width\n self.assertEqual(p.get_width(), 1.0)\n self.assertEqual(p.get_height(), 1)\n def test_case_6(self):\n with self.assertRaises(ValueError):\n f_426([])\n with self.assertRaises(ValueError):\n f_426([[]])\n with self.assertRaises(ValueError):\n f_426(\"\")\n with self.assertRaises(TypeError):\n f_426(None)\n with self.assertRaises(TypeError):\n f_426(1)\n with self.assertRaises(TypeError):\n f_426([1])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "collections.Counter", "matplotlib.pyplot.subplots", "itertools.chain"], "libs": ["itertools", "collections", "numpy", "matplotlib"], "doc": {"description": ["Given a nested list of menu items, flatten the list using itertool chain, count the occurrences of each item, then", "plot a histogram with an alphabetically sorted x-axis labeled as \"Menu Items\" and y-axis as \"Frequency\"."], "note": [], "params": ["list_of_menuitems (list): A non-empty nested list of menu items. Each element is a list of menu item strings.", "title (str, optional): The title of the histogram plot. Default is \"Menu Distribution\".", "color (str, optional): The color of the bars in the histogram. Default is \"blue\".", "width (float, optional): The width of the bars in the histogram. 
Default is 1.0."], "returns": ["ax (object): An Axes object representing the histogram plot."], "reqs": ["collections.Counter", "numpy", "matplotlib.pyplot", "itertools"], "raises": [], "example": [">>> f_426([['Pizza', 'Burger'], ['Pizza', 'Coke'], ['Pasta', 'Coke']])", "", ">>> f_426(['Burger'], title='A Title', color='red', width=5.0)", ""]}} +{"task_id": "f_749", "prompt": "import os\nimport re\n\ndef f_749(pattern: str, replacement: str, directory: str) -> bool:\n \"\"\"\n Renames all files in a directory that match a particular pattern with a given replacement string.\n \n Functionality:\n - Searches for files in the specified directory whose names contain the given pattern.\n - Renames those files by replacing the pattern with the given replacement string.\n \n Input:\n - pattern (str): The pattern to search for in the filenames.\n - replacement (str): The string to replace the pattern with.\n - directory (str): The directory in which to search for files.\n \n Output:\n - Returns a boolean value. 
True if the operation was successful, otherwise False.\n \n Requirements:\n - re\n - os\n\n Examples:\n >>> f_749('draft', 'final', '/home/user/documents')\n True\n >>> f_749('tmp', 'temp', '/home/user/downloads')\n False\n \"\"\"", "canonical_solution": " try:\n for file in os.listdir(directory):\n if re.search(pattern, file):\n new_filename = re.sub(pattern, replacement, file)\n os.rename(os.path.join(directory, file), os.path.join(directory, new_filename))\n return True\n except Exception as e:\n return False", "test": "import unittest\nimport tempfile\nimport shutil\nfrom pathlib import Path\nclass TestCases(unittest.TestCase):\n \n def setUp(self):\n self.test_dir = tempfile.mkdtemp()\n \n def tearDown(self):\n shutil.rmtree(self.test_dir)\n \n def create_test_files(self, filenames):\n for filename in filenames:\n Path(f\"{self.test_dir}/{filename}\").touch()\n \n def test_renaming_files(self):\n self.create_test_files([\"draft1.txt\", \"draft2.txt\", \"draft3.txt\"])\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n expected_files = sorted([\"final1.txt\", \"final2.txt\", \"final3.txt\"])\n actual_files = sorted(os.listdir(self.test_dir))\n self.assertEqual(expected_files, actual_files)\n \n def test_no_matching_files(self):\n self.create_test_files([\"file1.txt\", \"file2.txt\", \"file3.txt\"])\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n expected_files = sorted([\"file1.txt\", \"file2.txt\", \"file3.txt\"])\n actual_files = sorted(os.listdir(self.test_dir))\n self.assertEqual(expected_files, actual_files)\n \n def test_nonexistent_directory(self):\n result = f_749(\"draft\", \"final\", \"/nonexistent/directory\")\n self.assertFalse(result)\n \n def test_empty_directory(self):\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n self.assertEqual([], os.listdir(self.test_dir))\n \n def test_complex_pattern_renaming(self):\n 
self.create_test_files([\"draft_file1.txt\", \"file_draft2.txt\", \"draft3file.txt\"])\n result = f_749(\"draft\", \"final\", self.test_dir)\n self.assertTrue(result)\n expected_files = sorted([\"final_file1.txt\", \"file_final2.txt\", \"final3file.txt\"])\n actual_files = sorted(os.listdir(self.test_dir))\n self.assertEqual(expected_files, actual_files)", "apis": ["os.listdir", "re.search", "re.sub", "os.path", "os.rename", "os.path.join"], "libs": ["os", "re"], "doc": {"description": ["Renames all files in a directory that match a particular pattern with a given replacement string.", "Functionality:", "- Searches for files in the specified directory whose names contain the given pattern.", "- Renames those files by replacing the pattern with the given replacement string.", "Input:", "- pattern (str): The pattern to search for in the filenames.", "- replacement (str): The string to replace the pattern with.", "- directory (str): The directory in which to search for files.", "Output:", "- Returns a boolean value. True if the operation was successful, otherwise False."], "note": [], "params": [], "returns": [], "reqs": ["re", "os"], "raises": [], "example": ["Examples:", ">>> f_749('draft', 'final', '/home/user/documents')", "True", ">>> f_749('tmp', 'temp', '/home/user/downloads')", "False"]}} +{"task_id": "f_922", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_922(arr):\n \"\"\"\n Analyzes the distribution of values in a NumPy array to determine if it is uniform and\n generates a histogram representing this distribution.\n\n Parameters:\n - arr (numpy.ndarray): A NumPy array containing the values to be analyzed. \n The array can contain any hashable data type (e.g., integers, floats, strings).\n\n Returns:\n - tuple: A tuple containing two elements:\n - uniform_distribution (bool): A boolean value indicating whether the distribution is uniform. 
\n - Returns True if every unique value in the array appears the same number of times,\n indicating a uniform distribution.\n - Returns False otherwise.\n - ax (matplotlib.axes.Axes): An Axes object displaying the histogram of the array's value distribution.\n - The histogram's bins correspond to the unique values in the array.\n - The frequency of each unique value is represented by the height of the corresponding bin.\n\n Note:\n - The bin is set to `np.arange(len(unique) + 1) - 0.5` to align each bin with its corresponding unique value.\n\n Requirements:\n - numpy\n - matplotlib\n\n Example:\n >>> arr = np.array([\"A\", \"A\", \"B\", \"B\"])\n >>> is_uniform, ax = f_922(arr)\n >>> is_uniform\n True\n \"\"\"", "canonical_solution": " unique, counts = np.unique(arr, return_counts=True)\n uniform_distribution = len(set(counts)) == 1\n\n _, ax = plt.subplots()\n ax.hist(arr, bins=np.arange(len(unique) + 1) - 0.5, rwidth=0.8, align=\"mid\")\n ax.set_xticks(range(len(unique)))\n ax.set_xticklabels(unique)\n\n return uniform_distribution, ax", "test": "import numpy as np\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_922\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test uniform distribution.\"\"\"\n arr = np.array([\"A\", \"A\", \"B\", \"B\"])\n uniform, _ = f_922(arr)\n self.assertTrue(uniform)\n def test_non_uniform_distribution(self):\n \"\"\"Test non-uniform distribution.\"\"\"\n arr = np.array([\"A\", \"A\", \"B\", \"B\", \"B\", \"C\", \"C\", \"C\", \"C\", \"D\", \"E\", \"E\"])\n uniform, _ = f_922(arr)\n self.assertFalse(uniform)\n def test_single_value(self):\n \"\"\"Test single value.\"\"\"\n arr = np.array([\"A\", \"A\", \"A\", \"A\"])\n uniform, _ = f_922(arr)\n self.assertTrue(uniform)\n def test_multiple_equal_values(self):\n \"\"\"Test multiple equal values.\"\"\"\n arr = np.array([\"A\", \"A\", \"B\", \"B\", \"C\", \"C\", \"D\", \"D\"])\n uniform, _ = f_922(arr)\n self.assertTrue(uniform)\n def 
test_varying_values(self):\n \"\"\"Test varying values.\"\"\"\n arr = np.array([\"A\", \"B\", \"B\", \"C\", \"C\", \"C\", \"D\", \"D\", \"D\", \"D\"])\n uniform, _ = f_922(arr)\n self.assertFalse(uniform)\n def tearDown(self):\n plt.close()", "apis": ["numpy.unique", "matplotlib.pyplot.subplots", "numpy.arange"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Analyzes the distribution of values in a NumPy array to determine if it is uniform and", "generates a histogram representing this distribution."], "note": ["The bin is set to `np.arange(len(unique) + 1) - 0.5` to align each bin with its corresponding unique value."], "params": ["arr (numpy.ndarray): A NumPy array containing the values to be analyzed.", "The array can contain any hashable data type (e.g., integers, floats, strings)."], "returns": ["tuple: A tuple containing two elements:", "uniform_distribution (bool): A boolean value indicating whether the distribution is uniform.", "Returns True if every unique value in the array appears the same number of times,", "indicating a uniform distribution.", "Returns False otherwise.", "ax (matplotlib.axes.Axes): An Axes object displaying the histogram of the array's value distribution.", "The histogram's bins correspond to the unique values in the array.", "The frequency of each unique value is represented by the height of the corresponding bin."], "reqs": ["numpy", "matplotlib"], "raises": [], "example": [">>> arr = np.array([\"A\", \"A\", \"B\", \"B\"])", ">>> is_uniform, ax = f_922(arr)", ">>> is_uniform", "True"]}} +{"task_id": "f_920", "prompt": "from datetime import datetime\nimport pandas as pd\n\n# For Python versions lower than 3.9, use 'pytz' instead of 'zoneinfo'\ntry:\n from zoneinfo import ZoneInfo\nexcept ImportError:\n from pytz import timezone as ZoneInfo\n\nTIME_FORMAT = \"%d/%m/%y %H:%M:%S.%f\"\n\ndef f_920(time_strings, target_tz):\n \"\"\"\n Convert a list of time strings from UTC to a specified timezone and return a DataFrame.\n\n 
The function processes each UTC time string in the given list,\n converts it to the specified timezone, and stores the results in a DataFrame.\n\n Parameters:\n - time_strings (list of str): A list of time strings in UTC. Each string should be formatted as 'dd/mm/yy HH:MM:SS.fff'.\n - target_tz (str): The timezone identifier (e.g., 'America/New_York') to which the time strings should be converted.\n\n Returns:\n - pandas.DataFrame: A DataFrame with two columns: 'Original Time'\n containing the UTC times and 'Converted Time' containing the times converted to the target timezone.\n\n Requirements:\n - pandas\n - datetime\n - zoneinfo.ZoneInfo (Python 3.9+) or pytz.timezone.ZoneInfo (Python < 3.9)\n \n Note:\n - The function assumes that the input times are in UTC.\n\n Example:\n >>> time_strings = ['30/03/09 16:31:32.123', '15/04/10 14:25:46.789', '20/12/11 12:34:56.000']\n >>> df = f_920(time_strings, 'America/New_York')\n >>> print(df)\n Original Time Converted Time\n 0 30/03/09 16:31:32.123 30/03/09 12:31:32.123000\n 1 15/04/10 14:25:46.789 15/04/10 10:25:46.789000\n 2 20/12/11 12:34:56.000 20/12/11 07:34:56.000000\n \"\"\"", "canonical_solution": " data = []\n\n for time_string in time_strings:\n utc_time = datetime.strptime(time_string, TIME_FORMAT)\n converted_time = utc_time.replace(tzinfo=ZoneInfo(\"UTC\")).astimezone(\n ZoneInfo(target_tz)\n )\n data.append([time_string, converted_time.strftime(TIME_FORMAT)])\n\n df = pd.DataFrame(data, columns=[\"Original Time\", \"Converted Time\"])\n return df", "test": "import unittest\ntry:\n from zoneinfo import ZoneInfo\nexcept ImportError:\n from pytz import timezone as ZoneInfo\n# Test cases\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_920\"\"\"\n def test_conversion_from_utc(self):\n \"\"\"Test conversion from UTC to Eastern Standard Time.\"\"\"\n time_strings = [\"01/01/21 00:00:00.000\", \"01/01/21 12:00:00.000\"]\n df = f_920(time_strings, \"America/New_York\")\n expected = [\"31/12/20 
19:00:00.000000\", \"01/01/21 07:00:00.000000\"]\n self.assertEqual(list(df[\"Converted Time\"]), expected)\n def test_conversion_from_non_utc(self):\n \"\"\"Test conversion from Eastern Standard Time to India Standard Time.\"\"\"\n time_strings = [\"01/01/21 00:00:00.000\", \"01/01/21 12:00:00.000\"]\n df = f_920(time_strings, \"Asia/Kolkata\")\n expected = [\"01/01/21 05:30:00.000000\", \"01/01/21 17:30:00.000000\"]\n self.assertEqual(list(df[\"Converted Time\"]), expected)\n def test_empty_list(self):\n \"\"\"Test empty list.\"\"\"\n df = f_920([], \"America/New_York\")\n self.assertEqual(len(df), 0)\n def test_invalid_time_string(self):\n \"\"\"Test invalid time string.\"\"\"\n with self.assertRaises(ValueError):\n f_920([\"invalid_time_string\"], \"America/New_York\")\n def test_non_standard_time_format(self):\n \"\"\"Test handling of non-standard time format.\"\"\"\n time_strings = [\"2021-01-01 00:00:00\"]\n with self.assertRaises(ValueError):\n f_920(time_strings, \"America/New_York\")", "apis": ["pandas.DataFrame", "pytz.timezone", "datetime.datetime.strptime"], "libs": ["pandas", "pytz", "datetime"], "doc": {"description": ["Convert a list of time strings from UTC to a specified timezone and return a DataFrame.", "The function processes each UTC time string in the given list,", "converts it to the specified timezone, and stores the results in a DataFrame."], "note": ["The function assumes that the input times are in UTC."], "params": ["time_strings (list of str): A list of time strings in UTC. 
Each string should be formatted as 'dd/mm/yy HH:MM:SS.fff'.", "target_tz (str): The timezone identifier (e.g., 'America/New_York') to which the time strings should be converted."], "returns": ["pandas.DataFrame: A DataFrame with two columns: 'Original Time'", "containing the UTC times and 'Converted Time' containing the times converted to the target timezone."], "reqs": ["pandas", "datetime", "zoneinfo.ZoneInfo (Python 3.9+) or pytz.timezone.ZoneInfo (Python < 3.9)"], "raises": [], "example": [">>> time_strings = ['30/03/09 16:31:32.123', '15/04/10 14:25:46.789', '20/12/11 12:34:56.000']", ">>> df = f_920(time_strings, 'America/New_York')", ">>> print(df)", "Original Time Converted Time", "0 30/03/09 16:31:32.123 30/03/09 12:31:32.123000", "1 15/04/10 14:25:46.789 15/04/10 10:25:46.789000", "2 20/12/11 12:34:56.000 20/12/11 07:34:56.000000"]}} +{"task_id": "f_926", "prompt": "import pandas as pd\nfrom scipy.stats import pearsonr\n\n\ndef f_926(data):\n \"\"\"\n Calculates the Pearson correlation coefficient between numerical scores and categorical grades.\n\n This function performs three main tasks:\n 1. Converts scores from string format to floats.\n 2. Encodes categorical grades into numerical values based on their rank order.\n 3. 
Computes the Pearson correlation coefficient between the numerical scores and the encoded grades.\n\n Parameters:\n - data (dict): A dictionary containing two keys:\n - 'Score_String': A list of scores in string format.\n - 'Grade': A list of corresponding grades in string format.\n Each list under these keys must have the same length.\n\n Returns:\n - correlation (float): The Pearson correlation coefficient between the converted numerical scores and encoded grades.\n Returns NaN if the input data frame has less than 2 rows, as the correlation coefficient cannot be calculated in this case.\n\n Requirements:\n - pandas\n - scipy\n\n Example:\n >>> f_926({'Score_String': ['80.5', '85.7', '90.2'], 'Grade': ['B', 'B+', 'A-']})\n -0.46351538587606683\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data)\n if len(df) < 2: # Check if the data frame has less than 2 rows\n return float(\"nan\") # or return None\n\n df[\"Score_Float\"] = df[\"Score_String\"].astype(float)\n df[\"Grade_Encoded\"] = df[\"Grade\"].astype(\"category\").cat.codes\n correlation = pearsonr(df[\"Score_Float\"], df[\"Grade_Encoded\"])[0]\n return correlation", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_926\"\"\"\n def test_normal_operation(self):\n \"\"\"\n Test normal operation with valid input.\n \"\"\"\n data = {\"Score_String\": [\"80.5\", \"85.7\", \"90.2\"], \"Grade\": [\"B\", \"B+\", \"A-\"]}\n result = f_926(data)\n self.assertIsInstance(result, float)\n def test_empty_input(self):\n \"\"\"\n Test the function with empty input.\n \"\"\"\n data = {\"Score_String\": [], \"Grade\": []}\n result = f_926(data)\n self.assertTrue(pd.isna(result))\n def test_invalid_score_format(self):\n \"\"\"\n Test the function with invalid score format.\n \"\"\"\n data = {\"Score_String\": [\"eighty\", \"85.7\", \"90.2\"], \"Grade\": [\"B\", \"B+\", \"A-\"]}\n with self.assertRaises(ValueError):\n f_926(data)\n def 
test_mismatched_lengths(self):\n \"\"\"\n Test the function with mismatched lengths of scores and grades.\n \"\"\"\n data = {\"Score_String\": [\"80.5\", \"85.7\"], \"Grade\": [\"B\", \"B+\", \"A-\"]}\n with self.assertRaises(ValueError):\n f_926(data)\n def test_non_ordinal_grades(self):\n \"\"\"\n Test the function with non-ordinal grade inputs.\n \"\"\"\n data = {\n \"Score_String\": [\"80.5\", \"85.7\", \"90.2\"],\n \"Grade\": [\"Pass\", \"Fail\", \"Pass\"],\n }\n result = f_926(data)\n self.assertIsInstance(result, float)", "apis": ["pandas.DataFrame", "scipy.stats.pearsonr"], "libs": ["pandas", "scipy"], "doc": {"description": ["Calculates the Pearson correlation coefficient between numerical scores and categorical grades.", "This function performs three main tasks:", "1. Converts scores from string format to floats.", "2. Encodes categorical grades into numerical values based on their rank order.", "3. Computes the Pearson correlation coefficient between the numerical scores and the encoded grades."], "note": [], "params": ["data (dict): A dictionary containing two keys:", "'Score_String': A list of scores in string format.", "'Grade': A list of corresponding grades in string format.", "Each list under these keys must have the same length."], "returns": ["correlation (float): The Pearson correlation coefficient between the converted numerical scores and encoded grades.", "Returns NaN if the input data frame has less than 2 rows, as the correlation coefficient cannot be calculated in this case."], "reqs": ["pandas", "scipy"], "raises": [], "example": [">>> f_926({'Score_String': ['80.5', '85.7', '90.2'], 'Grade': ['B', 'B+', 'A-']})", "-0.46351538587606683"]}} +{"task_id": "f_797", "prompt": "import random\nimport re\n\n\ndef f_797(target_words, n_sentences, vocabulary):\n \"\"\"\n Generate sentences with spaces in certain target words replaced by underscores.\n\n Parameters:\n - target_words (list of str): List of words/phrases where spaces should be 
replaced with underscores.\n - n_sentences (int): Number of sentences to generate. Must not be negative.\n - vocabulary (list of str): List of words to use for generating sentences. Must not be empty.\n\n Returns:\n - list of str: A list of generated sentences in all lowercase, with specified words/phrases underscored.\n\n Raises:\n - ValueError: If n_sentences is negative or if the vocabulary is empty.\n\n Requirements:\n - random\n - re\n\n Notes:\n - Each sentence is generated by randomly sampling 10 words with replacement from a vocabulary,\n then concatenating with a single whitespace. Then, if any words from the target_words list\n appear in these sentences, spaces within those words are replaced with underscores; here the\n modification is insensitive to the case of the letters.\n - The function returns the processed sentences as a list of all lowercase strings.\n\n Examples:\n >>> random.seed(42)\n >>> f_797(['apple banana'], 1, ['apple', 'banana', 'cherry'])\n ['banana apple apple apple cherry cherry cherry apple_banana apple']\n >>> f_797(['Alice Charlie', 'ALICE BOB', 'aLiCe dAn'], 1, ['alice', 'bob', 'charlie', 'dan'])\n ['alice_charlie alice alice_charlie charlie alice_charlie dan alice']\n \"\"\"", "canonical_solution": " if n_sentences < 0:\n raise ValueError(\"n_sentences cannot be negative.\")\n if not vocabulary:\n raise ValueError(\"Vocabulary cannot be empty.\")\n\n sentences = []\n for _ in range(n_sentences):\n sentence = \" \".join(random.choices(vocabulary, k=10))\n for word in target_words:\n pattern = re.compile(re.escape(word), re.IGNORECASE)\n sentence = pattern.sub(word.replace(\" \", \"_\"), sentence)\n sentences.append(sentence.lower())\n return sentences", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.vocabulary = [\n \"apple\",\n \"banana\",\n \"cherry\",\n \"date\",\n \"elderberry\",\n \"fig\",\n \"grape\",\n \"honeydew\",\n ]\n random.seed(42)\n def test_case_1(self):\n # 
Test with multiple target words and sentences\n target_words = [\"apple banana\", \"banana cherry\"]\n n_sentences = 1000\n results = f_797(target_words, n_sentences, [\"apple\", \"banana\", \"cherry\"])\n self.assertEqual(len(results), n_sentences)\n for target in target_words:\n underscored_target = target.replace(\" \", \"_\")\n self.assertTrue(\n any(underscored_target in sentence for sentence in results),\n f\"{underscored_target} not found in any sentences\",\n )\n def test_case_2(self):\n # Test with a single target word in multiple occurrences\n target_words = [\"apple\"]\n n_sentences = 1\n results = f_797(target_words, n_sentences, [\"apple\"] * 10)\n self.assertEqual(len(results), n_sentences)\n self.assertTrue(\n results[0].count(\"apple\") > 1,\n \"Multiple 'apple' occurrences not replaced correctly\",\n )\n def test_case_3(self):\n # Test with no target words\n target_words = []\n n_sentences = 1\n results = f_797(target_words, n_sentences, self.vocabulary)\n self.assertEqual(len(results), n_sentences)\n self.assertTrue(all(\" \" in sentence for sentence in results), \"\")\n def test_case_4(self):\n # Test case sensitivity\n target_words = [\"Apple Banana\"]\n n_sentences = 2\n results = f_797(target_words, n_sentences, self.vocabulary + [\"apple banana\"])\n self.assertEqual(len(results), n_sentences)\n for result in results:\n self.assertIn(\n \"apple_banana\", result, \"Case sensitivity not handled properly\"\n )\n def test_case_5(self):\n # Test generating zero sentences\n target_words = [\"apple\"]\n n_sentences = 0\n results = f_797(target_words, n_sentences, self.vocabulary)\n self.assertEqual(len(results), n_sentences, \"No sentences should be generated\")\n def test_case_6(self):\n # Test function handling invalid inputs - vocabulary\n target_words = [\"apple\"]\n n_sentences = 1\n with self.assertRaises(ValueError):\n f_797(target_words, n_sentences, [])\n def test_case_7(self):\n # Test function handling invalid inputs - n_sentences\n 
target_words = [\"apple\"]\n with self.assertRaises(ValueError):\n f_797(target_words, -1, self.vocabulary)\n with self.assertRaises(TypeError):\n f_797(target_words, 1.0, self.vocabulary)\n def test_case_8(self):\n # Test whitespace target word\n target_words = [\" \"]\n n_sentences = 1\n results = f_797(target_words, n_sentences, [\"apple banana\", \"cherry\"])\n assert len(results[0].split(\"_\")) >= 10\n def test_case_9(self):\n # Test target word not in vocabulary\n target_words = [\"mango\"]\n n_sentences = 2\n results = f_797(target_words, n_sentences, [\"apple\", \"banana\", \"cherry\"])\n for sentence in results:\n self.assertNotIn(\n \"mango\",\n sentence,\n \"Target word not in vocabulary should not appear in sentences.\",\n )", "apis": ["re.IGNORECASE", "re.compile", "re.escape", "random.choices"], "libs": ["re", "random"], "doc": {"description": ["Generate sentences with spaces in certain target words replaced by underscores.", "Notes:", "- Each sentence is generated by randomly sampling 10 words with replacement from a vocabulary,", "then concatenating with a single whitespace. Then, if any words from the target_words list", "appear in these sentences, spaces within those words are replaced with underscores; here the", "modification is insensitive to the case of the letters.", "- The function returns the processed sentences as a list of all lowercase strings."], "note": [], "params": ["target_words (list of str): List of words/phrases where spaces should be replaced with underscores.", "n_sentences (int): Number of sentences to generate. Must not be negative.", "vocabulary (list of str): List of words to use for generating sentences. 
Must not be empty."], "returns": ["list of str: A list of generated sentences in all lowercase, with specified words/phrases underscored."], "reqs": ["random", "re"], "raises": ["ValueError: If n_sentences is negative or if the vocabulary is empty."], "example": ["Examples:", ">>> random.seed(42)", ">>> f_797(['apple banana'], 1, ['apple', 'banana', 'cherry'])", "['banana apple apple apple cherry cherry cherry apple_banana apple']", ">>> f_797(['Alice Charlie', 'ALICE BOB', 'aLiCe dAn'], 1, ['alice', 'bob', 'charlie', 'dan'])", "['alice_charlie alice alice_charlie charlie alice_charlie dan alice']"]}} +{"task_id": "f_814", "prompt": "import os\nfrom pathlib import Path\nfrom datetime import datetime, timezone\n\n\ndef f_814(directory_path: str):\n \"\"\"\n Analyzes a given directory, listing each file it contains along with its size,\n creation time, and last modification time without recursing into subdirectories.\n\n Args:\n - directory_path (str): The path to the directory to be analyzed.\n If it is empty, this function returns an empty list.\n\n Returns:\n - list of tuples: Each tuple contains (file name, file size in bytes,\n creation time in ISO format, modification time in ISO format).\n\n Raises:\n - ValueError: If the provided directory does not exist.\n\n Requirements:\n - os\n - pathlib\n - datetime\n\n Notes:\n - The function assumes the directory exists and contains only files (no\n subdirectories are processed).\n - Times are reported in system time, UTC.\n - The creation and modification times are platform dependent; on some systems,\n the creation time might not be available and might be replaced by the last\n metadata change time.\n\n Examples:\n >>> result = f_814('/path/to/directory')\n >>> print(result)\n [('example.txt', 1024, '2023-04-01T14:30:00Z', '2023-04-02T15:00:00Z'), ...]\n\n >>> result = f_814('/path/to/empty_directory')\n >>> print(result)\n []\n \"\"\"", "canonical_solution": " if not Path(directory_path).is_dir():\n raise 
ValueError(f\"The path {directory_path} is not a valid directory.\")\n\n file_details = []\n for entry in os.scandir(directory_path):\n if entry.is_file():\n file_info = os.stat(entry.path)\n file_size = file_info.st_size\n creation_time = datetime.fromtimestamp(\n file_info.st_ctime, timezone.utc\n ).isoformat()\n modification_time = datetime.fromtimestamp(\n file_info.st_mtime, timezone.utc\n ).isoformat()\n file_details.append(\n (entry.name, file_size, creation_time, modification_time)\n )\n\n return file_details", "test": "import unittest\nimport tempfile\nimport os\nfrom datetime import datetime, timezone, timedelta\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Set up a 'before' time with leeway for testing file modification times\n self.before_creation = datetime.now(timezone.utc) - timedelta(seconds=1)\n # Setup a temporary directory\n self.test_dir = tempfile.TemporaryDirectory()\n # Create test files\n self.files = {\n \"empty.txt\": 0,\n \"small.txt\": 5,\n \"medium.txt\": 50,\n \"large.txt\": 500,\n \"utc_test.txt\": 10,\n }\n for file_name, size in self.files.items():\n path = os.path.join(self.test_dir.name, file_name)\n with open(path, \"wb\") as f:\n f.write(os.urandom(size))\n def tearDown(self):\n # Cleanup the directory after tests\n self.test_dir.cleanup()\n def test_case_1(self):\n # Test the function on an existing directory.\n result = f_814(self.test_dir.name)\n self.assertEqual(len(result), len(self.files))\n def test_case_2(self):\n # Test the function with a non-existing directory.\n with self.assertRaises(ValueError):\n f_814(\"/path/to/non/existing/directory\")\n def test_case_3(self):\n # Test the function with an empty directory.\n with tempfile.TemporaryDirectory() as empty_dir:\n result = f_814(empty_dir)\n self.assertEqual(len(result), 0)\n def test_case_4(self):\n # Test if the function correctly identifies file sizes.\n result = f_814(self.test_dir.name)\n sizes = {file[0]: file[1] for file in result}\n for 
file_name, size in self.files.items():\n self.assertEqual(sizes[file_name], size)\n def test_case_5(self):\n # Test if the function lists all expected files, regardless of order.\n result = f_814(self.test_dir.name)\n file_names = sorted([file[0] for file in result])\n expected_file_names = sorted(\n list(self.files.keys())\n ) # Assuming 'utc_test.txt' is expected.\n self.assertListEqual(file_names, expected_file_names)\n def test_case_6(self):\n # Test if modification times are correctly identified.\n result = f_814(self.test_dir.name)\n # Check if modification times are reasonable (not testing specific times because of system differences)\n for _, _, creation_time, modification_time in result:\n creation_datetime = datetime.fromisoformat(creation_time)\n modification_datetime = datetime.fromisoformat(modification_time)\n self.assertTrue(creation_datetime <= modification_datetime)\n def test_case_7(self):\n # Test that the function ignores directories.\n sub_dir_path = os.path.join(self.test_dir.name, \"subdir\")\n os.mkdir(sub_dir_path)\n # Add a file inside the sub-directory to ensure it's not empty\n with open(os.path.join(sub_dir_path, \"file.txt\"), \"w\") as sub_file:\n sub_file.write(\"This is a test.\")\n result = f_814(self.test_dir.name)\n self.assertEqual(\n len(result), len(self.files)\n ) # Should not count the subdir or its contents\n def test_case_8(self):\n # Test if file names are correctly identified.\n result = f_814(self.test_dir.name)\n names = [file[0] for file in result]\n for name in self.files.keys():\n self.assertIn(name, names)\n def test_case_9(self):\n # Test that a non-directory path raises a ValueError.\n with tempfile.NamedTemporaryFile() as tmpfile:\n with self.assertRaises(ValueError):\n f_814(tmpfile.name)\n def test_case_10(self):\n # Test timestamps are in UTC and within a reasonable accuracy window.\n self.after_creation = datetime.now(timezone.utc)\n result = f_814(self.test_dir.name)\n for _, _, creation_time, 
modification_time in result:\n creation_dt = datetime.fromisoformat(creation_time)\n modification_dt = datetime.fromisoformat(modification_time)\n # Ensure the timestamps are in UTC\n self.assertEqual(creation_dt.tzinfo, timezone.utc)\n self.assertEqual(modification_dt.tzinfo, timezone.utc)\n # Ensure timestamps are within a reasonable window\n self.assertTrue(self.before_creation <= creation_dt <= self.after_creation)\n self.assertTrue(\n self.before_creation <= modification_dt <= self.after_creation\n )", "apis": ["os.scandir", "os.stat", "datetime.datetime.fromtimestamp", "pathlib.Path", "datetime.timezone.utc"], "libs": ["os", "pathlib", "datetime"], "doc": {"description": ["Analyzes a given directory, listing each file it contains along with its size,", "creation time, and last modification time without recursing into subdirectories.", "Args:", "- directory_path (str): The path to the directory to be analyzed.", "If it is empty, this function returns an empty list.", "Notes:", "- The function assumes the directory exists and contains only files (no", "subdirectories are processed).", "- Times are reported in system time, UTC.", "- The creation and modification times are platform dependent; on some systems,", "the creation time might not be available and might be replaced by the last", "metadata change time.", ">>> result = f_814('/path/to/empty_directory')", ">>> print(result)", "[]"], "note": [], "params": [], "returns": ["list of tuples: Each tuple contains (file name, file size in bytes,", "creation time in ISO format, modification time in ISO format)."], "reqs": ["os", "pathlib", "datetime"], "raises": ["ValueError: If the provided directory does not exist."], "example": ["Examples:", ">>> result = f_814('/path/to/directory')", ">>> print(result)", "[('example.txt', 1024, '2023-04-01T14:30:00Z', '2023-04-02T15:00:00Z'), ...]"]}} +{"task_id": "f_835", "prompt": "import sys\nimport sqlite3\n\n# Constants\nPATH_TO_APPEND = \"path/to/whatever\"\nDATABASE = 
\"path/to/database.db\"\n\n\ndef f_835(path_to_append=PATH_TO_APPEND, database=DATABASE):\n \"\"\"\n This function appends a given path to sys.path and updates an SQLite database with the path, \n creating the table if needed and avoiding duplicates.\n\n Parameters:\n - path_to_append (str): A file system path to be appended to sys.path and inserted\n into the SQLite database. Defaults to 'path/to/whatever' if not specified.\n - database (str): The file system path to the SQLite database file. Defaults to\n 'path/to/database.db' if not provided. The function interacts with this database\n to store the path.\n\n Returns:\n - str: The path that was appended to sys.path and inserted into the database.\n\n Requirements:\n - sys\n - sqlite3\n\n\n Examples:\n >>> f_835('path/to/new_directory', 'path/to/new_database.db')\n 'path/to/new_directory'\n >>> f_835()\n 'path/to/whatever'\n \"\"\"", "canonical_solution": " sys.path.append(path_to_append)\n\n conn = sqlite3.connect(database)\n cur = conn.cursor()\n cur.execute(\"CREATE TABLE IF NOT EXISTS paths (path TEXT UNIQUE)\")\n cur.execute(\"INSERT OR IGNORE INTO paths (path) VALUES (?)\", (path_to_append,))\n conn.commit()\n conn.close()\n\n return path_to_append", "test": "import unittest\nimport sqlite3\nimport os\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_835\"\"\"\n def setUp(self):\n path_to_create = os.path.dirname(PATH_TO_APPEND)\n os.makedirs(path_to_create, exist_ok=True)\n self.test_db = DATABASE\n def test_basic_path_insertion(self):\n \"\"\"Test the function when a path is provided.\"\"\"\n test_path = \"path/to/test/path\"\n result = f_835(test_path, self.test_db)\n self.assertEqual(result, test_path)\n # Check the database to ensure the path was saved\n conn = sqlite3.connect(self.test_db)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM paths WHERE path=?\", (test_path,))\n fetched_path = cur.fetchone()\n conn.close()\n self.assertIsNotNone(fetched_path)\n 
self.assertEqual(fetched_path[0], test_path)\n def test_existing_path(self):\n \"\"\"Test the function when an existing path is provided.\"\"\"\n # Insert an existing path\n existing_path = \"existing/path\"\n f_835(existing_path, self.test_db)\n # Attempt to insert the same path again\n result = f_835(existing_path, self.test_db)\n self.assertEqual(result, existing_path)\n # Check the database to ensure there's only one entry for the existing path\n conn = sqlite3.connect(self.test_db)\n cur = conn.cursor()\n cur.execute(\"SELECT COUNT(*) FROM paths WHERE path=?\", (existing_path,))\n count = cur.fetchone()[0]\n conn.close()\n self.assertEqual(count, 1)\n def test_multiple_paths(self):\n \"\"\"Test the function when multiple paths are provided.\"\"\"\n paths = [\"path1\", \"path2\", \"path3\"]\n for path in paths:\n result = f_835(path, self.test_db)\n self.assertEqual(result, path)\n # Check the database to ensure all paths are saved\n conn = sqlite3.connect(self.test_db)\n cur = conn.cursor()\n cur.execute(\"SELECT COUNT(*) FROM paths\")\n count = cur.fetchone()[0]\n conn.close()\n self.assertEqual(count, len(paths))\n def test_database_creation(self):\n \"\"\"Test the function when the database doesn't exist.\"\"\"\n new_db = \"path/to/new_test_database.db\"\n test_path = \"path/to/new\"\n os.makedirs(os.path.dirname(test_path), exist_ok=True)\n result = f_835(test_path, new_db)\n self.assertEqual(result, test_path)\n # Check the new database to ensure the path was saved\n conn = sqlite3.connect(new_db)\n cur = conn.cursor()\n cur.execute(\"SELECT * FROM paths WHERE path=?\", (test_path,))\n fetched_path = cur.fetchone()\n conn.close()\n self.assertIsNotNone(fetched_path)\n self.assertEqual(fetched_path[0], test_path)\n def test_invalid_database(self):\n \"\"\"Test the function when an invalid database is provided.\"\"\"\n invalid_db = \"invalid/path/database.db\"\n test_path = \"test/path\"\n with self.assertRaises(sqlite3.OperationalError):\n f_835(test_path, 
invalid_db)\n def tearDown(self):\n # Cleanup the test databases\n dbs_to_remove = [\"path/to/database.db\", \"path/to/new_test_database.db\"]\n for db in dbs_to_remove:\n if os.path.exists(db):\n os.remove(db)\n # Cleanup the test directories\n dirs_to_remove = [\"path/to/whatever\", \"path/to\", \"path\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["sys.path.append", "sqlite3.connect", "sys.path"], "libs": ["sys", "sqlite3"], "doc": {"description": ["This function appends a given path to sys.path and updates an SQLite database with the path,", "creating the table if needed and avoiding duplicates."], "note": [], "params": ["path_to_append (str): A file system path to be appended to sys.path and inserted", "into the SQLite database. Defaults to 'path/to/whatever' if not specified.", "database (str): The file system path to the SQLite database file. Defaults to", "'path/to/database.db' if not provided. The function interacts with this database", "to store the path."], "returns": ["str: The path that was appended to sys.path and inserted into the database."], "reqs": ["sys", "sqlite3"], "raises": [], "example": ["Examples:", ">>> f_835('path/to/new_directory', 'path/to/new_database.db')", "'path/to/new_directory'", ">>> f_835()", "'path/to/whatever'"]}} {"task_id": "f_531", "prompt": "from itertools import combinations\nimport math\n\ndef f_531(x, w):\n \"\"\"\n Find the continuous substring of x, which has the maximum total weight, given a dictionary where the keys are characters and the values are their weights.\n\n Parameters:\n - x (str): The input string.\n - w (dict): The dictionary of character weights.\n\n Returns:\n - max_substr (str): The continuous substring with the highest weight.\n\n Requirements:\n - itertools\n - math\n\n Example:\n >>> f_531('c', {'a': 1, 'b': 2, 'c': 3})\n 'c'\n >>> f_531('abc', {'a': 10, 'b': -5, 'c': 3})\n 'a'\n \"\"\"", "canonical_solution": " max_weight = -math.inf\n 
max_substr = ''\n\n for start, end in combinations(range(len(x) + 1), 2):\n substr = x[start:end]\n weight = sum(w.get(c, 0) for c in substr)\n if weight > max_weight:\n max_weight = weight\n max_substr = substr\n\n return max_substr", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_531('c', {'a': 1, 'b': 2, 'c': 3}), 'c')\n \n def test_case_2(self):\n self.assertEqual(f_531('aabc', {'a': 10, 'b': -5, 'c': 3}), 'aa')\n def test_case_3(self):\n self.assertEqual(f_531('aabc', {'a': 10, 'b': -2, 'c': 3}), 'aabc')\n def test_case_4(self):\n self.assertEqual(f_531('aabc', {'a': 2, 'b': -5, 'c': 3}), 'aa')\n \n def test_case_5(self):\n self.assertEqual(f_531('aabc', {'a': 0, 'b': -1, 'c': 1}), 'c')", "apis": ["itertools.combinations", "math.inf"], "libs": ["itertools", "math"], "doc": {"description": ["Find the continuous substring of x, which has the maximum total weight, given a dictionary where the keys are characters and the values are their weights."], "note": [], "params": ["x (str): The input string.", "w (dict): The dictionary of character weights."], "returns": ["max_substr (str): The continuous substring with the highest weight."], "reqs": ["itertools", "math"], "raises": [], "example": [">>> f_531('c', {'a': 1, 'b': 2, 'c': 3})", "'c'", ">>> f_531('abc', {'a': 10, 'b': -5, 'c': 3})", "'a'"]}} -{"task_id": "f_598", "prompt": "import json\nimport numpy as np\n\ndef f_598(df):\n \"\"\"\n Given a DataFrame with random values and an 'IntCol' column, transform the 'IntCol' column by a logarithm (base 10) and write it to a `IntCol.json` file as a list. 
Also return the DataFrame.\n\n Parameters:\n - df (DataFrame): A pandas DataFrame with a 'IntCol' column.\n\n Returns:\n - df (DataFrame): A pandas DataFrame to describe the transformed data.\n\n Requirements:\n - json\n - pandas\n - numpy\n - os\n\n Example:\n >>> df = pd.DataFrame({'IntCol': [10, 100, 1000, 10000, 100000]})\n >>> df_transformed = f_598(df)\n >>> print(df_transformed)\n IntCol\n 0 1.0\n 1 2.0\n 2 3.0\n 3 4.0\n 4 5.0\n\n \"\"\"", "canonical_solution": " df['IntCol'] = np.log10(df['IntCol'])\n\n # Convert 'IntCol' column to a list and write it to a JSON file\n int_col_list = df['IntCol'].tolist()\n with open('IntCol.json', 'w') as json_file:\n json.dump(int_col_list, json_file)\n\n return df", "test": "import unittest\nimport os\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n if os.path.exists('IntCol.json'):\n os.remove('IntCol.json')\n \n def test_case_1(self):\n df = pd.DataFrame({'IntCol': [10, 100, 1000, 10000, 100000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [1, 2, 3, 4, 5]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [1, 2, 3, 4, 5]))\n def test_case_2(self):\n df = pd.DataFrame({'IntCol': [10000000, 100000000, 1000000000, 10000000000, 100000000000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [7, 8, 9, 10, 11]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [7, 8, 9, 10, 11]))\n def test_case_3(self):\n df = pd.DataFrame({'IntCol': [0, 0, 0, 0, 0]})\n df_transformed = f_598(df)\n 
self.assertTrue(np.allclose(df_transformed['IntCol'], [-np.inf, -np.inf, -np.inf, -np.inf, -np.inf]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [-np.inf, -np.inf, -np.inf, -np.inf, -np.inf]))\n def test_case_4(self):\n df = pd.DataFrame({'IntCol': [10000000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [7]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [7]))\n def test_case_5(self):\n df = pd.DataFrame({'IntCol': [1, 10, 100, 1000, 10000, 100000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [0, 1, 2, 3, 4, 5]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [0, 1, 2, 3, 4, 5]))", "apis": ["json.dump", "numpy.log10"], "libs": ["numpy", "json"], "doc": {"description": ["Given a DataFrame with random values and an 'IntCol' column, transform the 'IntCol' column by a logarithm (base 10) and write it to a `IntCol.json` file as a list. 
Also return the DataFrame."], "note": [], "params": ["df (DataFrame): A pandas DataFrame with a 'IntCol' column."], "returns": ["df (DataFrame): A pandas DataFrame to describe the transformed data."], "reqs": ["json", "pandas", "numpy", "os"], "raises": [], "example": [">>> df = pd.DataFrame({'IntCol': [10, 100, 1000, 10000, 100000]})", ">>> df_transformed = f_598(df)", ">>> print(df_transformed)", "IntCol", "0 1.0", "1 2.0", "2 3.0", "3 4.0", "4 5.0"]}} -{"task_id": "f_544", "prompt": "import math\nimport yaml\n\ndef f_544(yaml_path, key):\n \"\"\"\n Read a YAML file, apply the cosine to a specific key from the data, and then write the modified data back into the YAML file.\n \n Parameters:\n - yaml_path (str): The path to the YAML file.\n - key (str): The key to take the cosine of.\n \n Returns:\n - data (dict): A dictionary representation of the modified YAML data.\n\n Requirements:\n - math\n - yaml\n \n Example:\n >>> yaml_data = f_544('data.yaml', 'ele')\n \"\"\"", "canonical_solution": " with open(yaml_path, 'r') as file:\n data = yaml.safe_load(file)\n\n if key in data:\n data[key] = math.cos(data[key])\n\n with open(yaml_path, 'w') as file:\n yaml.safe_dump(data, file)\n\n return data", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def base(self, yaml_path, key, contents, expected):\n # Create YAML file\n with open(yaml_path, 'w') as file:\n yaml.safe_dump(contents, file)\n # Run function\n data = f_544(yaml_path, key)\n # Check data\n self.assertEqual(data, expected)\n # Remove YAML file\n os.remove(yaml_path)\n def test_case_1(self):\n self.base('./data.yaml', 'ele', {'ele': 1, 'ale': 2, 'ile': 3}, {'ele': math.cos(1), 'ale': 2, 'ile': 3})\n def test_case_2(self):\n self.base('./y.yaml', 'zzz', {'zzz': 1, 'yyy': 2, 'xxx': 3}, {'zzz': math.cos(1), 'yyy': 2, 'xxx': 3})\n def test_case_3(self):\n self.base('./data.yaml', 'ale', {'ele': 1, 'ale': 2, 'ile': 3}, {'ele': 1, 'ale': math.cos(2), 'ile': 3})\n def test_case_4(self):\n 
self.base('./y.yaml', 'yyy', {'zzz': 1, 'yyy': 2, 'xxx': 3}, {'zzz': 1, 'yyy': math.cos(2), 'xxx': 3})\n def test_case_5(self):\n self.base('./data.yaml', 'ile', {'ele': 1, 'ale': 2, 'ile': 3}, {'ele': 1, 'ale': 2, 'ile': math.cos(3)})", "apis": ["yaml.safe_load", "yaml.safe_dump", "math.cos"], "libs": ["yaml", "math"], "doc": {"description": ["Read a YAML file, apply the cosine to a specific key from the data, and then write the modified data back into the YAML file."], "note": [], "params": ["yaml_path (str): The path to the YAML file.", "key (str): The key to take the cosine of."], "returns": ["data (dict): A dictionary representation of the modified YAML data."], "reqs": ["math", "yaml"], "raises": [], "example": [">>> yaml_data = f_544('data.yaml', 'ele')"]}} -{"task_id": "f_549", "prompt": "import numpy as np\nfrom sklearn.preprocessing import OneHotEncoder\n\ndef f_549(list_of_lists):\n \"\"\"\n Merges a predefined set of lists into a list and one-hot-encodes the elements of the list.\n\n Parameters:\n - list_of_lists (list): The list to be processed.\n\n Returns:\n - one_hot (numpy.array): The one-hot encoding of the merged list.\n\n Requirements:\n - numpy\n - scikit-learn\n\n Example:\n >>> f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],\n [0., 1., 0., 0., 0., 0., 0., 0., 0.],\n [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n [0., 0., 0., 0., 0., 0., 0., 1., 0.],\n [0., 0., 0., 0., 0., 0., 0., 0., 1.]])\n \"\"\"", "canonical_solution": " merged_list = np.array([item for sublist in list_of_lists for item in sublist]).reshape(-1, 1)\n encoder = OneHotEncoder(sparse=False)\n one_hot = encoder.fit_transform(merged_list)\n return one_hot", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_549([[1, 2, 3], [4, 5, 6], 
[7, 8, 9]]).shape, (9, 9))\n def test_case_2(self):\n arr = f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.assertTrue(np.all(arr.sum(axis=0) == 1))\n self.assertTrue(np.all(arr.sum(axis=1) == 1))\n self.assertTrue(np.all(arr >= 0))\n def test_case_3(self):\n arr = f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.assertEqual(arr[0, 0], 1)\n self.assertEqual(arr[1, 1], 1)\n self.assertEqual(arr[2, 2], 1)\n self.assertEqual(arr[3, 3], 1)\n self.assertEqual(arr[4, 4], 1)\n self.assertEqual(arr[5, 5], 1)\n self.assertEqual(arr[6, 6], 1)\n self.assertEqual(arr[7, 7], 1)\n self.assertEqual(arr[8, 8], 1)\n \n def test_case_4(self):\n arr = f_549([[1, 1, 1], [2, 2, 2], [3, 3, 3]])\n self.assertEqual(arr[0, 0], 1)\n self.assertEqual(arr[1, 0], 1)\n self.assertEqual(arr[2, 0], 1)\n self.assertEqual(arr[3, 1], 1)\n self.assertEqual(arr[4, 1], 1)\n self.assertEqual(arr[5, 1], 1)\n self.assertEqual(arr[6, 2], 1)\n self.assertEqual(arr[7, 2], 1)\n self.assertEqual(arr[8, 2], 1)\n def test_case_5(self):\n arr = f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.assertEqual(arr[0, 0], 1)\n self.assertEqual(arr[1, 1], 1)\n self.assertEqual(arr[2, 2], 1)\n self.assertEqual(arr[3, 3], 1)\n self.assertEqual(arr[4, 4], 1)\n self.assertEqual(arr[5, 5], 1)\n self.assertEqual(arr[6, 6], 1)\n self.assertEqual(arr[7, 7], 1)\n self.assertEqual(arr[8, 8], 1)", "apis": ["numpy.array", "sklearn.preprocessing.OneHotEncoder"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Merges a predefined set of lists into a list and one-hot-encodes the elements of the list."], "note": [], "params": ["list_of_lists (list): The list to be processed."], "returns": ["one_hot (numpy.array): The one-hot encoding of the merged list."], "reqs": ["numpy", "scikit-learn"], "raises": [], "example": [">>> f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", "array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],", "[0., 1., 0., 0., 0., 0., 0., 0., 0.],", "[0., 0., 1., 0., 0., 0., 0., 0., 0.],", "[0., 0., 0., 1., 0., 0., 0., 0., 0.],", 
"[0., 0., 0., 0., 1., 0., 0., 0., 0.],", "[0., 0., 0., 0., 0., 1., 0., 0., 0.],", "[0., 0., 0., 0., 0., 0., 1., 0., 0.],", "[0., 0., 0., 0., 0., 0., 0., 1., 0.],", "[0., 0., 0., 0., 0., 0., 0., 0., 1.]])"]}} -{"task_id": "f_739", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport random\n\n# Constants\nNUMBERS = list(range(1, 7)) # Adjusting for dice rolls (1 to 6)\n\ndef f_739(rolls, seed=None):\n \"\"\"\n Simulate a number of dice rolls, calculate the frequency of each result, and return both the frequency array and a histogram of the results.\n\n Note:\n The dice rolls have 6 possible outcomes.\n The title of the histogram is \"Histogram of Dice Rolls\".\n The x-axis is labeled \"Dice Value\" and the y-axis is labeled \"Frequency\".\n \n Parameters:\n rolls (int): The number of dice rolls.\n\n Returns:\n tuple: A tuple containing:\n - np.array: A numpy array with the frequency of each outcome.\n - matplotlib.Axes: Axes object representing the histogram.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - random\n\n Examples:\n >>> import random\n >>> random.seed(0)\n >>> outcomes, ax = f_739(10000)\n >>> print(outcomes)\n [1656 1690 1696 1657 1632 1669]\n >>> plt.show()\n >>> random.seed(10)\n >>> outcomes, ax = f_739(100)\n >>> print(outcomes)\n [15 21 17 22 16 9]\n >>> plt.show()\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n \n outcomes = [random.choice(NUMBERS) for _ in range(rolls)]\n frequencies = np.bincount(outcomes, minlength=7)[1:] # Excluding 0 as dice starts from 1\n\n # Creating histogram\n fig, ax = plt.subplots()\n ax.hist(outcomes, bins=np.arange(1, 7+1.5)-0.5, edgecolor='black')\n ax.set_title('Histogram of Dice Rolls')\n ax.set_xlabel('Dice Value')\n ax.set_ylabel('Frequency')\n\n return frequencies, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n outcomes, ax = f_739(100, seed=1)\n 
self.assertEqual(len(outcomes), 6)\n self.assertEqual(sum(outcomes), 100)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n def test_case_2(self):\n outcomes, ax = f_739(0, seed=2)\n self.assertEqual(outcomes.tolist(), [0, 0, 0, 0, 0, 0])\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n def test_case_3(self):\n outcomes, ax = f_739(100000, seed=3)\n self.assertEqual(outcomes.tolist(), [16607, 16689, 16800, 16625, 16640, 16639])\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n \n def test_case_4(self):\n outcomes, ax = f_739(1, seed=4)\n self.assertEqual(outcomes.tolist(), [0, 1, 0, 0, 0, 0])\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n \n def test_case_5(self):\n outcomes, ax = f_739(10, seed=5)\n self.assertEqual(sum(outcomes), 10)\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')", "apis": ["numpy.arange", "numpy.bincount", "random.seed", "random.choice", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "random"], "doc": {"description": ["Simulate a number of dice rolls, calculate the frequency of each result, and return both the frequency array and a histogram of the results."], "note": ["The dice rolls have 6 possible outcomes.", "The title of the histogram is \"Histogram of Dice Rolls\".", "The x-axis is labeled \"Dice Value\" and the y-axis is labeled \"Frequency\"."], "params": ["rolls (int): The number of dice rolls."], "returns": ["tuple: A 
tuple containing:", "np.array: A numpy array with the frequency of each outcome.", "matplotlib.Axes: Axes object representing the histogram."], "reqs": ["numpy", "matplotlib.pyplot", "random"], "raises": [], "example": ["Examples:", ">>> import random", ">>> random.seed(0)", ">>> outcomes, ax = f_739(10000)", ">>> print(outcomes)", "[1656 1690 1696 1657 1632 1669]", ">>> plt.show()", ">>> random.seed(10)", ">>> outcomes, ax = f_739(100)", ">>> print(outcomes)", "[15 21 17 22 16 9]", ">>> plt.show()"]}} -{"task_id": "f_782", "prompt": "import re\nimport os\nimport glob\n\ndef f_782(dir_path: str) -> list:\n \"\"\"\n Rename all files in the specified directory by removing all special characters,\n punctuation marks, and spaces, using regular expressions. The function keeps\n alphanumeric characters and removes the rest.\n\n Requirements:\n - re\n - os\n - glob\n\n Parameters:\n dir_path (str): The path to the directory containing the files to be renamed.\n\n Returns:\n list[str]: A list containing the new names of all files after renaming.\n\n Example:\n >>> f_782('path/to/directory')\n ['file1', 'file2', 'file3']\n >>> f_782('another/directory/path')\n ['anotherFile1', 'anotherFile2']\n \"\"\"", "canonical_solution": " new_names = []\n for file_path in glob.glob(os.path.join(dir_path, '*')):\n base_name = os.path.basename(file_path)\n new_name = re.sub('[^A-Za-z0-9]+', '', base_name)\n new_path = os.path.join(dir_path, new_name)\n os.rename(file_path, new_path)\n new_names.append(new_name)\n return new_names", "test": "import unittest\nfrom pathlib import Path\nimport shutil\nclass TestRenameFiles(unittest.TestCase):\n \n def setUp(self):\n self.temp_dir = Path(\"temp_test_dir\")\n self.temp_dir.mkdir(parents=True, exist_ok=True)\n \n def tearDown(self):\n shutil.rmtree(self.temp_dir)\n \n def test_special_characters_removal(self):\n test_files = [\"file@1.txt\", \"file_#2.txt\", \"file$ 3.txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n 
\n expected_names = [\"file1txt\", \"file2txt\", \"file3txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))\n \n def test_alphanumeric_names(self):\n test_files = [\"file1.txt\", \"file2.txt\", \"file3.txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n \n expected_names = [\"file1txt\", \"file2txt\", \"file3txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))\n \n def test_empty_directory(self):\n expected_names = []\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(new_file_names, expected_names)\n \n def test_only_special_characters(self):\n test_files = [\"@@@.txt\", \"###.txt\", \"$$$ .txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n \n expected_names = [\"txt\", \"txt\", \"txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))\n \n def test_mixed_characters(self):\n test_files = [\"f@ile_1.txt\", \"file# 2.txt\", \"fi$le 3.txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n \n expected_names = [\"file1txt\", \"file2txt\", \"file3txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))", "apis": ["os.path.basename", "os.rename", "re.sub", "os.path", "os.path.join", "glob.glob"], "libs": ["glob", "re", "os"], "doc": {"description": ["Rename all files in the specified directory by removing all special characters,", "punctuation marks, and spaces, using regular expressions. 
The function keeps", "alphanumeric characters and removes the rest."], "note": [], "params": ["dir_path (str): The path to the directory containing the files to be renamed."], "returns": ["list[str]: A list containing the new names of all files after renaming."], "reqs": ["re", "os", "glob"], "raises": [], "example": [">>> f_782('path/to/directory')", "['file1', 'file2', 'file3']", ">>> f_782('another/directory/path')", "['anotherFile1', 'anotherFile2']"]}} -{"task_id": "f_336", "prompt": "import pandas as pd\nfrom sklearn.feature_selection import SelectKBest, f_classif\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_336(df1, df2):\n \"\"\"Perform the feature selection with SelectKBest (k=2) and return a heatmap of the feature correlations.\n\n Parameters:\n - df1 (pd.DataFrame): The dataframe containing features.\n - df2 (pd.DataFrame): The dataframe containing the target variable. Must have an 'id' column corresponding to df1.\n\n Returns:\n - tuple: A tuple containing:\n - list: A list of the selected features.\n - AxesSubplot: A heatmap showing the correlation between the selected features.\n\n Requirements:\n - pandas\n - sklearn.feature_selection.SelectKBest\n - sklearn.feature_selection.f_classif\n - seaborn\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})\n >>> selected_features, heatmap = f_336(df1, df2)\n >>> heatmap\n \n >>> selected_features\n ['feature2', 'feature3']\n \"\"\"", "canonical_solution": " # Merge dataframes based on 'id'\n df = pd.merge(df1, df2, on=\"id\")\n\n # Separate features and target\n features = df1.columns.drop(\"id\")\n X = df[features]\n y = df[\"target\"]\n\n # Select top 2 features\n selector = SelectKBest(f_classif, k=2)\n X_new = selector.fit_transform(X, y)\n\n selected_features = [x for x, y in zip(features, selector.get_support()) if 
y]\n\n # Draw heatmap\n heatmap = sns.heatmap(\n pd.DataFrame(X_new, columns=selected_features).corr(), annot=True\n )\n\n return selected_features, heatmap", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def test_case_1(self):\n # Dataset with clear distinction between features\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4, 5],\n \"feature1\": [5.5, 6.7, 7.8, 8.9, 9.0],\n \"feature2\": [1.1, 2.2, 3.3, 4.4, 5.5],\n \"feature3\": [0.5, 1.5, 2.5, 3.5, 4.5],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3, 4, 5], \"target\": [1, 0, 1, 0, 1]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature1\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_2(self):\n # Dataset with features having moderate correlation\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1.2, 3.4, 5.6],\n \"feature2\": [2.3, 4.5, 6.7],\n \"feature3\": [3.4, 5.6, 7.8],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [4.5, 6.7, 8.9]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_3(self):\n # Dataset with balanced target values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4],\n \"feature1\": [2.5, 3.5, 4.5, 5.5],\n \"feature2\": [6.6, 7.7, 8.8, 9.9],\n \"feature3\": [10.1, 11.1, 12.1, 13.1],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3, 4], \"target\": [0, 1, 0, 1]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def 
test_case_4(self):\n # Smaller dataset\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2],\n \"feature1\": [3.3, 4.4],\n \"feature2\": [5.5, 6.6],\n \"feature3\": [7.7, 8.8],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2], \"target\": [1, 0]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_5(self):\n # Dataset with different feature correlations\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [10, 20, 30],\n \"feature2\": [40, 50, 60],\n \"feature3\": [70, 80, 90],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [1, 0, 1]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_6(self):\n # Test handling errors - no \"id\"\n df1 = pd.DataFrame(\n {\n \"feature1\": [10, 20, 30],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [1, 0, 1]})\n with self.assertRaises(KeyError):\n f_336(df1, df2)\n def test_case_7(self):\n # Test handling errors - wrong types\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [\"a\", \"b\", 3],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [1, 0, 1]})\n with self.assertRaises(ValueError):\n f_336(df1, df2)", "apis": ["pandas.DataFrame", "seaborn.heatmap", "sklearn.feature_selection.SelectKBest", "pandas.merge"], "libs": ["pandas", "sklearn", "seaborn"], "doc": {"description": ["Perform the feature selection with SelectKBest (k=2) and return a heatmap of the feature correlations."], "note": [], "params": ["df1 (pd.DataFrame): The dataframe containing features.", "df2 (pd.DataFrame): The dataframe containing the target variable. 
Must have an 'id' column corresponding to df1."], "returns": ["tuple: A tuple containing:", "list: A list of the selected features.", "AxesSubplot: A heatmap showing the correlation between the selected features."], "reqs": ["pandas", "sklearn.feature_selection.SelectKBest", "sklearn.feature_selection.f_classif", "seaborn"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})", ">>> selected_features, heatmap = f_336(df1, df2)", ">>> heatmap", "", ">>> selected_features", "['feature2', 'feature3']"]}} -{"task_id": "f_743", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\n\n# Updated function to handle empty input list\ndef f_743(d):\n \"\"\"\n Scale all values with the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d\" with MinMaxScaler.\n\n Parameters:\n d (list): A list of dictionaries.\n\n Returns:\n DataFrame: A pandas DataFrame with scaled values.\n\n Requirements:\n - pandas\n - sklearn.preprocessing.MinMaxScaler\n\n Example usage:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n >>> print(f_743(data))\n x y z\n 0 0.0 0.642857 0.0\n 1 1.0 1.000000 0.5\n 2 0.5 0.000000 1.0\n\n >>> data = [{'x': -1, 'y': 0, 'z': 5}, {'x': 3, 'y': -15, 'z': 0}, {'x': 0, 'y': 1, 'z': -7}]\n >>> print(f_743(data))\n x y z\n 0 0.00 0.9375 1.000000\n 1 1.00 0.0000 0.583333\n 2 0.25 1.0000 0.000000\n \"\"\"", "canonical_solution": " if not d: # Check if the input list is empty\n return pd.DataFrame(columns=['x', 'y', 'z']) # Return an empty DataFrame with specified columns\n \n df = pd.DataFrame(d)\n scaler = MinMaxScaler()\n scaled_df = pd.DataFrame(scaler.fit_transform(df[['x', 'y', 'z']]), columns=['x', 'y', 'z'])\n\n return scaled_df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def 
test_case_1(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, 1.0, 0.5], 'y': [0.642857, 1.0, 0.0], 'z': [0.0, 0.5, 1.0]})\n pd.testing.assert_frame_equal(result, expected_df)\n \n def test_case_2(self):\n data = [{'x': -1, 'y': 0, 'z': 5}, {'x': 3, 'y': -15, 'z': 0}, {'x': 0, 'y': 1, 'z': -7}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, 1.0, 0.25], 'y': [0.9375, 0.0, 1.0], 'z': [1.0, 0.583333, 0.0]})\n pd.testing.assert_frame_equal(result, expected_df)\n \n def test_case_3(self):\n data = []\n result = f_743(data)\n expected_df = pd.DataFrame(columns=['x', 'y', 'z'])\n pd.testing.assert_frame_equal(result, expected_df)\n \n def test_case_4(self):\n data = [{'x': 1}, {'y': 2}, {'z': 3}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, None, None], 'y': [None, 0.0, None], 'z': [None, None, 0.0]})\n pd.testing.assert_frame_equal(result, expected_df)\n \n def test_case_5(self):\n data = [{'x': 1, 'y': 2}, {'x': 3, 'z': 4}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, 1.0], 'y': [0.0, None], 'z': [None, 0.0]})\n pd.testing.assert_frame_equal(result, expected_df)", "apis": ["sklearn.preprocessing.MinMaxScaler", "pandas.DataFrame"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Scale all values with the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d\" with MinMaxScaler.", "Example usage:", ">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]", ">>> print(f_743(data))", "x y z", "0 0.0 0.642857 0.0", "1 1.0 1.000000 0.5", "2 0.5 0.000000 1.0", ">>> data = [{'x': -1, 'y': 0, 'z': 5}, {'x': 3, 'y': -15, 'z': 0}, {'x': 0, 'y': 1, 'z': -7}]", ">>> print(f_743(data))", "x y z", "0 0.00 0.9375 1.000000", "1 1.00 0.0000 0.583333", "2 0.25 1.0000 0.000000"], "note": [], "params": ["d (list): A list of dictionaries."], "returns": ["DataFrame: A pandas 
DataFrame with scaled values."], "reqs": ["pandas", "sklearn.preprocessing.MinMaxScaler"], "raises": [], "example": []}} -{"task_id": "f_409", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_409(data_list):\n \"\"\"\n Visualizes the scores of students over multiple tests using a line plot.\n\n The function takes in a list of dictionaries. Each dictionary contains the name of a student (key)\n and their score (value). It combines these dictionaries into a pandas DataFrame and plots a line graph\n of student scores over tests, where the x-axis represents the test number and the y-axis represents the score.\n Each student's scores are plotted as separate lines. Missing scores are handled by not plotting\n those specific data points, allowing for discontinuous lines where data is missing.\n\n Parameters:\n - data_list (list of dict): A list of dictionaries with student names as keys and their scores as values.\n\n Returns:\n - ax (matplotlib.axes._axes.Axes): The Axes object with the plotted data.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> data = [{'John': 5, 'Jane': 10}, {'John': 6, 'Jane': 8}, {'John': 5, 'Jane': 9}]\n >>> ax = f_409(data)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-0.25, 0, '\u22120.25'), Text(0.0, 0, '0.00'), Text(0.25, 0, '0.25'), Text(0.5, 0, '0.50'), Text(0.75, 0, '0.75'), Text(1.0, 0, '1.00'), Text(1.25, 0, '1.25'), Text(1.5, 0, '1.50'), Text(1.75, 0, '1.75'), Text(2.0, 0, '2.00'), Text(2.25, 0, '2.25')]\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_list)\n fig, ax = plt.subplots()\n for column in df:\n ax.plot(df[column], label=column)\n ax.set_title(\"Student Scores over Tests\")\n ax.set_xlabel(\"Test Number\")\n ax.set_ylabel(\"Score\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n data = [\n {\"John\": 5, \"Jane\": 10, \"Joe\": 7},\n 
{\"John\": 6, \"Jane\": 8, \"Joe\": 10},\n {\"John\": 5, \"Jane\": 9, \"Joe\": 8},\n {\"John\": 7, \"Jane\": 10, \"Joe\": 9},\n ]\n self.validate_plot(data)\n def test_case_2(self):\n data = [{\"John\": 3}, {\"John\": 4}, {\"John\": 5}, {\"John\": 6}]\n self.validate_plot(data)\n def test_case_3(self):\n data = [\n {\"John\": 3, \"Jane\": 2},\n {\"John\": 4, \"Jane\": 3},\n {\"John\": 5, \"Jane\": 4},\n {\"John\": 6, \"Jane\": 5},\n ]\n self.validate_plot(data)\n def test_case_4(self):\n data = [\n {\"John\": 10, \"Jane\": 20, \"Joe\": 15, \"Jack\": 25},\n {\"John\": 12, \"Jane\": 18, \"Joe\": 14, \"Jack\": 24},\n {\"John\": 11, \"Jane\": 19, \"Joe\": 13, \"Jack\": 23},\n {\"John\": 13, \"Jane\": 21, \"Joe\": 16, \"Jack\": 22},\n ]\n self.validate_plot(data)\n def test_case_5(self):\n data = [\n {\"John\": 7, \"Jane\": 8},\n {\"John\": 8, \"Jane\": 7},\n {\"John\": 7, \"Jane\": 8},\n {\"John\": 8, \"Jane\": 7},\n ]\n self.validate_plot(data)\n def test_case_6(self):\n data = []\n self.validate_plot(data)\n def test_case_7(self):\n # Floats\n data = [{\"John\": 5.5, \"Jane\": 10.1}, {\"John\": 6.75, \"Jane\": 8.25}]\n self.validate_plot(data)\n def test_case_8(self):\n # Missing scores\n data = [{\"John\": 5, \"Jane\": 10}, {\"Jane\": 8, \"Joe\": 7}, {\"John\": 6}]\n self.validate_plot(data)\n def validate_plot(self, data):\n ax = f_409(data)\n self.assertIsInstance(ax, plt.Axes)\n df = pd.DataFrame(data)\n for idx, column in enumerate(df):\n plotted_data_y = ax.lines[idx].get_ydata()\n expected_data_y = df[column].values.astype(float)\n # Handle float comparisons\n np.testing.assert_allclose(\n plotted_data_y, expected_data_y, rtol=1e-5, atol=1e-8, equal_nan=True\n )\n plotted_data_x = ax.lines[idx].get_xdata().astype(int)\n expected_data_x = np.arange(len(df[column].values))\n self.assertTrue(\n np.array_equal(plotted_data_x, expected_data_x),\n msg=f\"X-data Mismatch for {column}. 
Plotted: {plotted_data_x}, Expected: {expected_data_x}\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Visualizes the scores of students over multiple tests using a line plot.", "The function takes in a list of dictionaries. Each dictionary contains the name of a student (key)", "and their score (value). It combines these dictionaries into a pandas DataFrame and plots a line graph", "of student scores over tests, where the x-axis represents the test number and the y-axis represents the score.", "Each student's scores are plotted as separate lines. Missing scores are handled by not plotting", "those specific data points, allowing for discontinuous lines where data is missing."], "note": [], "params": ["data_list (list of dict): A list of dictionaries with student names as keys and their scores as values."], "returns": ["ax (matplotlib.axes._axes.Axes): The Axes object with the plotted data."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [{'John': 5, 'Jane': 10}, {'John': 6, 'Jane': 8}, {'John': 5, 'Jane': 9}]", ">>> ax = f_409(data)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-0.25, 0, '\u22120.25'), Text(0.0, 0, '0.00'), Text(0.25, 0, '0.25'), Text(0.5, 0, '0.50'), Text(0.75, 0, '0.75'), Text(1.0, 0, '1.00'), Text(1.25, 0, '1.25'), Text(1.5, 0, '1.50'), Text(1.75, 0, '1.75'), Text(2.0, 0, '2.00'), Text(2.25, 0, '2.25')]"]}} -{"task_id": "f_857", "prompt": "import requests\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_857(api_url):\n \"\"\"\n Fetches data from a specified API, processes the JSON response, converts it into a pandas DataFrame,\n and plots the data using matplotlib.\n If the data is empty, no plot is generated. 
If the API request fails, it raises an HTTPError.\n The function also checks if the provided API URL is a string.\n\n Parameters:\n - api_url (str): The URL of the API to fetch data from.\n\n Returns:\n - DataFrame: A pandas DataFrame with the parsed data from the API.\n - AxesSubplot or None: A matplotlib AxesSubplot object representing the plot of the data, or None if the data is empty.\n\n Raises:\n - HTTPError: If the API request fails due to issues like network problems, invalid response, etc.\n - TypeError: If the `api_url` is not a string.\n\n Requirements:\n - requests\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> df, plot = f_857(\"https://api.example.com/data\")\n >>> df.head()\n >>> if plot:\n >>> plot.show()\n \"\"\"", "canonical_solution": " # Send the GET request and handle API failure\n if not isinstance(api_url, str):\n raise TypeError(\"api_url must be a string\")\n\n response = requests.get(api_url, timeout=5)\n response.raise_for_status()\n\n # Parse the JSON response and convert it to a pandas DataFrame\n data = response.json()\n df = pd.DataFrame(data)\n\n # Generate a plot if the DataFrame is not empty\n plot = df.plot() if not df.empty else None\n\n return df, plot", "test": "import unittest\nfrom unittest.mock import patch, Mock\nimport pandas as pd\nimport matplotlib.pyplot as plt\nAPI_URL = \"https://api.example.com/data\"\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n @patch(\"requests.get\")\n def test_successful_api_call_with_data(self, mock_get):\n \"\"\"Test the function with a successful API call returning non-empty data.\"\"\"\n mock_get.return_value = Mock(status_code=200, json=lambda: [{\"a\": 1, \"b\": 2}])\n df, plot = f_857(\"http://example.com/api\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIsInstance(plot, plt.Axes)\n @patch(\"requests.get\")\n def test_successful_api_call_with_empty_data(self, mock_get):\n \"\"\"Test the function with a successful API call returning 
empty data.\"\"\"\n mock_get.return_value = Mock(status_code=200, json=lambda: [])\n df, plot = f_857(\"http://example.com/api\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertTrue(df.empty)\n self.assertIsNone(plot)\n @patch(\"requests.get\")\n def test_api_call_with_invalid_json(self, mock_get):\n \"\"\"Test the function with an API call returning invalid JSON.\"\"\"\n mock_get.return_value = Mock(\n status_code=200, json=lambda: Exception(\"Invalid JSON\")\n )\n with self.assertRaises(Exception):\n f_857(\"http://example.com/api\")\n @patch(\"requests.get\")\n def test_api_call_with_http_error(self, mock_get):\n \"\"\"Test the function with an API call that raises an HTTP error.\"\"\"\n mock_get.side_effect = requests.HTTPError()\n with self.assertRaises(requests.HTTPError):\n f_857(\"http://example.com/api\")\n def test_incorrect_url_type(self):\n \"\"\"Test the function with an incorrect type for the URL.\"\"\"\n with self.assertRaises(TypeError):\n f_857(123)\n def tearDown(self):\n plt.close()", "apis": ["pandas.DataFrame", "requests.get"], "libs": ["pandas", "requests"], "doc": {"description": ["Fetches data from a specified API, processes the JSON response, converts it into a pandas DataFrame,", "and plots the data using matplotlib.", "If the data is empty, no plot is generated. 
If the API request fails, it raises an HTTPError.", "The function also checks if the provided API URL is a string."], "note": [], "params": ["api_url (str): The URL of the API to fetch data from."], "returns": ["DataFrame: A pandas DataFrame with the parsed data from the API.", "AxesSubplot or None: A matplotlib AxesSubplot object representing the plot of the data, or None if the data is empty."], "reqs": ["requests", "pandas", "matplotlib.pyplot"], "raises": ["HTTPError: If the API request fails due to issues like network problems, invalid response, etc.", "TypeError: If the `api_url` is not a string."], "example": [">>> df, plot = f_857(\"https://api.example.com/data\")", ">>> df.head()", ">>> if plot:", ">>> plot.show()"]}} -{"task_id": "f_546", "prompt": "from collections import Counter\nfrom itertools import chain\n\ndef f_546(list_of_lists):\n \"\"\"\n Merge all sublists from a list of lists into a list and return a count of the elements.\n \n Parameters:\n - list_of_lists (list): The list to be processed.\n\n Returns:\n - collections.Counter: Counter object with the counts of the elements in the merged list.\n\n Requirements:\n - itertools\n - collections\n \n Example:\n >>> f_546([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1})\n \"\"\"", "canonical_solution": " merged_list = list(chain.from_iterable(list_of_lists))\n return Counter(merged_list)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}))\n def test_case_2(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 2, 2: 2, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}))\n def test_case_3(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2], [1, 2, 3, 4, 5, 6, 7, 8, 9]]\n 
self.assertEqual(f_546(list_of_lists), Counter({1: 3, 2: 3, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2}))\n def test_case_4(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2], [1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 4, 2: 4, 3: 3, 4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2}))\n def test_case_5(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2], [1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 5, 2: 5, 3: 4, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3, 9: 3}))", "apis": ["itertools.chain.from_iterable", "collections.Counter"], "libs": ["collections", "itertools"], "doc": {"description": ["Merge all sublists from a list of lists into a list and return a count of the elements."], "note": [], "params": ["list_of_lists (list): The list to be processed."], "returns": ["collections.Counter: Counter object with the counts of the elements in the merged list."], "reqs": ["itertools", "collections"], "raises": [], "example": [">>> f_546([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", "Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1})"]}} -{"task_id": "f_767", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_767(data_size=1000, column_names=['A', 'B', 'C', 'D', 'E'], seed=0):\n \"\"\"\n Generate a Pandas DataFrame with random numeric values between 1 and 100, inclusive, and replace all occurrences of values less than 10 with -1.\n \n Requirements:\n - pandas\n - numpy\n \n Parameters:\n - data_size (int, optional): The number of rows in the DataFrame. Defaults to 1000.\n - column_names (list of str, optional): Names of the DataFrame columns. 
Defaults to ['A', 'B', 'C', 'D', 'E'].\n\n Returns:\n - DataFrame: The modified Pandas DataFrame.\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n df = pd.DataFrame(np.random.randint(1, 101, size=(data_size, len(column_names))), columns=column_names)\n df[df < 10] = -1 # Correctly replace values less than 10 with -1\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_default_parameters(self):\n df = f_767(seed=42)\n self.assertEqual(df.shape, (1000, 5))\n # Check that there are no values < 10 except -1\n condition = ((df >= 10) | (df == -1)).all().all()\n self.assertTrue(condition, \"DataFrame contains values less than 10 that were not replaced with -1\")\n def test_custom_data_size_and_columns(self):\n df = f_767(data_size=10, column_names=['X', 'Y'], seed=55)\n self.assertEqual(df.shape, (10, 2))\n # Check that there are no values < 10 except -1\n condition = ((df >= 10) | (df == -1)).all().all()\n self.assertTrue(condition, \"DataFrame contains values less than 10 that were not replaced with -1\")\n def test_correct_replacement_of_values(self):\n df = f_767(data_size=100, seed=0)\n self.assertTrue(((df >= 10) | (df == -1)).all().all(), \"Not all values less than 10 were replaced with -1\")\n \n def test_correct_dataframe_dimensions(self):\n rows, columns = 50, 3\n df = f_767(data_size=rows, column_names=['P', 'Q', 'R'], seed=1)\n self.assertEqual(df.shape, (rows, columns), \"DataFrame dimensions are incorrect\")\n \n def test_with_minimum_data_size(self):\n df = f_767(data_size=1, column_names=['Single'], seed=2)\n self.assertEqual(df.shape, (1, 1), \"DataFrame does not handle minimum data size correctly\")", "apis": ["numpy.random", "numpy.random.randint", "pandas.DataFrame", "numpy.random.seed"], "libs": ["numpy", "pandas"], "doc": {"description": ["Generate a Pandas DataFrame with random numeric values between 1 and 100, inclusive, and replace all occurrences of values less than 10 with -1."], "note": [], "params": 
["data_size (int, optional): The number of rows in the DataFrame. Defaults to 1000.", "column_names (list of str, optional): Names of the DataFrame columns. Defaults to ['A', 'B', 'C', 'D', 'E']."], "returns": ["DataFrame: The modified Pandas DataFrame."], "reqs": ["pandas", "numpy"], "raises": [], "example": []}} -{"task_id": "f_375", "prompt": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_375(N=100, CATEGORIES=[\"A\", \"B\", \"C\", \"D\", \"E\"], seed=42):\n \"\"\"\n Create a DataFrame with a given number of rows (N) and 3 columns: \"x\" and \"y\" with random values,\n and \"category\" with random categories from a given CATEGORIES list. Each category is guaranteed to\n appear at least once if N is greater than or equal to the number of categories, otherwise it is\n randomly sampled without replacement from CATEGORIES. Finally, draw a scatter plot of \"x\" vs \"y,\"\n colored by \"category\".\n\n Parameters:\n - N (int, optional): Number of rows for the DataFrame. Defaults to 100.\n - CATEGORIES (list, optional): List of categories. Defaults to ['A', 'B', 'C', 'D', 'E'].\n - seed (int, optional): Random seed for reproducibility. 
Defaults to 42.\n\n Returns:\n tuple: A tuple containing:\n - DataFrame: The generated DataFrame.\n - Axes: The Axes object of the scatter plot.\n\n Requirements:\n - numpy\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> df, ax = f_375()\n >>> df.head()\n x y category\n 0 0.239562 0.385098 C\n 1 0.144895 0.851137 D\n 2 0.489453 0.316922 C\n 3 0.985650 0.169493 E\n 4 0.242055 0.556801 A\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " np.random.seed(seed)\n\n if N < len(CATEGORIES):\n all_categories = np.random.choice(CATEGORIES, N, replace=False)\n else:\n guaranteed_categories = np.array(CATEGORIES)\n remaining_categories = np.random.choice(CATEGORIES, N - len(CATEGORIES))\n all_categories = np.concatenate([guaranteed_categories, remaining_categories])\n np.random.shuffle(all_categories)\n\n df = pd.DataFrame(\n {\"x\": np.random.rand(N), \"y\": np.random.rand(N), \"category\": all_categories}\n )\n\n fig, ax = plt.subplots()\n for category in CATEGORIES:\n ax.scatter(\n df[df[\"category\"] == category][\"x\"],\n df[df[\"category\"] == category][\"y\"],\n label=category,\n )\n\n return df, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test default parameter\n df, ax = f_375()\n self.assertEqual(df.shape, (100, 3))\n self.assertSetEqual(set(df[\"category\"]), {\"A\", \"B\", \"C\", \"D\", \"E\"})\n self.assertListEqual(list(df.columns), [\"x\", \"y\", \"category\"])\n self.assertTrue(df[\"x\"].between(0, 1).all())\n self.assertTrue(df[\"y\"].between(0, 1).all())\n self.assertIsInstance(ax, plt.Axes)\n def test_case_2(self):\n # Test custom parameters\n df, ax = f_375(N=50, CATEGORIES=[\"X\", \"Y\"])\n self.assertEqual(df.shape, (50, 3))\n self.assertSetEqual(set(df[\"category\"]), {\"X\", \"Y\"})\n self.assertListEqual(list(df.columns), [\"x\", \"y\", \"category\"])\n self.assertTrue(df[\"x\"].between(0, 1).all())\n self.assertTrue(df[\"y\"].between(0, 
1).all())\n self.assertIsInstance(ax, plt.Axes)\n def test_case_3(self):\n # Test N specifically\n for N in [5, 10, 50, 200]:\n df, _ = f_375(N=N)\n self.assertEqual(df.shape, (N, 3))\n def test_case_4(self):\n # Test categories specifically\n for C in [[\"APPLE\", \"BANANA\"], [\"carrot\", \"dragonfruit\", \"eggplant\"], [\"F\"]]:\n df, _ = f_375(CATEGORIES=C)\n self.assertSetEqual(set(df[\"category\"]), set(C))\n def test_case_5(self):\n # Test random seed\n df1, _ = f_375(seed=0)\n df2, _ = f_375(seed=0)\n df3, _ = f_375(seed=1)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertFalse(df1.equals(df3))\n def test_case_6(self):\n # Test handling empty dataframe\n df, _ = f_375(N=0, CATEGORIES=[])\n self.assertEqual(df.shape, (0, 3))\n self.assertListEqual(list(df[\"category\"]), [])\n def test_case_7(self):\n # Test handing more categories than data points\n df, _ = f_375(N=3, CATEGORIES=[\"A\", \"B\", \"C\", \"D\"])\n self.assertEqual(len(df), 3)\n self.assertEqual(len(set(df[\"category\"])), 3)\n def test_case_8(self):\n # Test single category\n df, _ = f_375(N=50, CATEGORIES=[\"X\"])\n self.assertTrue((df[\"category\"] == \"X\").all())\n def test_case_9(self):\n # Test other category types\n df, _ = f_375(N=50, CATEGORIES=[1, 2, 3])\n self.assertSetEqual(set(df[\"category\"]), {1, 2, 3})\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random", "numpy.random.seed", "numpy.random.shuffle", "numpy.random.choice", "numpy.random.rand", "numpy.array", "pandas.DataFrame", "matplotlib.pyplot.subplots", "numpy.concatenate"], "libs": ["numpy", "pandas", "matplotlib"], "doc": {"description": ["Create a DataFrame with a given number of rows (N) and 3 columns: \"x\" and \"y\" with random values,", "and \"category\" with random categories from a given CATEGORIES list. Each category is guaranteed to", "appear at least once if N is greater than or equal to the number of categories, otherwise it is", "randomly sampled without replacement from CATEGORIES. 
Finally, draw a scatter plot of \"x\" vs \"y,\"", "colored by \"category\"."], "note": [], "params": ["N (int, optional): Number of rows for the DataFrame. Defaults to 100.", "CATEGORIES (list, optional): List of categories. Defaults to ['A', 'B', 'C', 'D', 'E'].", "seed (int, optional): Random seed for reproducibility. Defaults to 42."], "returns": ["tuple: A tuple containing:", "DataFrame: The generated DataFrame.", "Axes: The Axes object of the scatter plot."], "reqs": ["numpy", "pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> df, ax = f_375()", ">>> df.head()", "x y category", "0 0.239562 0.385098 C", "1 0.144895 0.851137 D", "2 0.489453 0.316922 C", "3 0.985650 0.169493 E", "4 0.242055 0.556801 A", ">>> type(ax)", ""]}} -{"task_id": "f_747", "prompt": "import os\nimport glob\nimport csv\n\ndef f_747(directory_path, file_extension='.csv'):\n \"\"\"\n Reads all files with a specified extension in a given directory and returns their data in a dictionary.\n\n Functionality:\n - Reads all files with the specified extension in the given directory.\n - Uses the filename without the extension as a key in the output dictionary.\n - The value for each key is a list of rows from the file, where each row is represented as a list of values.\n\n Input:\n - directory_path (str): The path to the directory containing the files.\n - file_extension (str, optional): The file extension to look for. 
Default is '.csv'.\n\n Output:\n - Returns a dictionary where each key is the filename (without extension) and the value is a list of rows from the file.\n\n Requirements:\n - os\n - glob\n - csv\n\n Example:\n >>> data = f_747('/home/user/data')\n >>> print(data['file1'])\n [['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']]\n \n >>> data = f_747('/home/user/data', '.txt')\n >>> print(data)\n {}\n \"\"\"", "canonical_solution": " data = {}\n\n for file in glob.glob(os.path.join(directory_path, '*' + file_extension)):\n filename = os.path.splitext(os.path.basename(file))[0]\n with open(file, 'r') as f:\n reader = csv.reader(f)\n data[filename] = list(reader)\n\n return data", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # create a directory with test files\n os.mkdir('test_1')\n with open('test_1/file1.csv', 'w', newline='') as f:\n writer = csv.writer(f)\n writer.writerows([['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']])\n os.mkdir('test_2')\n with open('test_2/file2.csv', 'w', newline='') as f:\n writer = csv.writer(f)\n writer.writerows([['name', 'age'], ['Alice', '30'], ['Bob', '40']])\n os.mkdir('test_5')\n with open('test_5/file3.csv', 'w', newline='') as f:\n writer = csv.writer(f)\n writer.writerows([['subject', 'marks'], ['Math', '90'], ['Science', '85']])\n def tearDown(self):\n # remove the test directories\n shutil.rmtree('test_1')\n shutil.rmtree('test_2')\n shutil.rmtree('test_5')\n \n def test_case_1(self):\n # This test assumes the existence of a directory named 'f_747_data_wenhao' with a CSV file 'file1.csv'\n data = f_747('test_1')\n self.assertIsInstance(data, dict)\n self.assertIn('file1', data)\n self.assertEqual(data['file1'], [['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']])\n def test_case_2(self):\n # This test checks explicit file_extension input\n data = f_747('test_2', '.csv')\n 
self.assertIsInstance(data, dict)\n self.assertIn('file2', data)\n self.assertEqual(data['file2'], [['name', 'age'], ['Alice', '30'], ['Bob', '40']])\n def test_case_3(self):\n # This test checks for a non-existent file extension, expecting an empty dictionary\n data = f_747('test_3', '.txt')\n self.assertIsInstance(data, dict)\n self.assertEqual(len(data), 0)\n def test_case_4(self):\n # This test checks for a non-existent directory, expecting an empty dictionary\n data = f_747('/nonexistent/directory')\n self.assertIsInstance(data, dict)\n self.assertEqual(len(data), 0)\n def test_case_5(self):\n # This test checks another file's presence and content in the dictionary\n data = f_747('test_5')\n self.assertIsInstance(data, dict)\n self.assertIn('file3', data)\n self.assertEqual(data['file3'], [['subject', 'marks'], ['Math', '90'], ['Science', '85']])", "apis": ["os.path.splitext", "csv.reader", "os.path.basename", "os.path", "os.path.join", "glob.glob"], "libs": ["csv", "glob", "os"], "doc": {"description": ["Reads all files with a specified extension in a given directory and returns their data in a dictionary.", "Functionality:", "- Reads all files with the specified extension in the given directory.", "- Uses the filename without the extension as a key in the output dictionary.", "- The value for each key is a list of rows from the file, where each row is represented as a list of values.", "Input:", "- directory_path (str): The path to the directory containing the files.", "- file_extension (str, optional): The file extension to look for. 
Default is '.csv'.", "Output:", "- Returns a dictionary where each key is the filename (without extension) and the value is a list of rows from the file.", ">>> data = f_747('/home/user/data', '.txt')", ">>> print(data)", "{}"], "note": [], "params": [], "returns": [], "reqs": ["os", "glob", "csv"], "raises": [], "example": [">>> data = f_747('/home/user/data')", ">>> print(data['file1'])", "[['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']]"]}} +{"task_id": "f_598", "prompt": "import json\nimport numpy as np\n\ndef f_598(df):\n \"\"\"\n Given a DataFrame with random values and an 'IntCol' column, transform the 'IntCol' column by a logarithm (base 10) and write it to a `IntCol.json` file as a list. Also return the DataFrame.\n\n Parameters:\n - df (DataFrame): A pandas DataFrame with a 'IntCol' column.\n\n Returns:\n - df (DataFrame): A pandas DataFrame to describe the transformed data.\n\n Requirements:\n - json\n - pandas\n - numpy\n - os\n\n Example:\n >>> df = pd.DataFrame({'IntCol': [10, 100, 1000, 10000, 100000]})\n >>> df_transformed = f_598(df)\n >>> print(df_transformed)\n IntCol\n 0 1.0\n 1 2.0\n 2 3.0\n 3 4.0\n 4 5.0\n\n \"\"\"", "canonical_solution": " df['IntCol'] = np.log10(df['IntCol'])\n\n # Convert 'IntCol' column to a list and write it to a JSON file\n int_col_list = df['IntCol'].tolist()\n with open('IntCol.json', 'w') as json_file:\n json.dump(int_col_list, json_file)\n\n return df", "test": "import unittest\nimport os\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n if os.path.exists('IntCol.json'):\n os.remove('IntCol.json')\n \n def test_case_1(self):\n df = pd.DataFrame({'IntCol': [10, 100, 1000, 10000, 100000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [1, 2, 3, 4, 5]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n 
int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [1, 2, 3, 4, 5]))\n def test_case_2(self):\n df = pd.DataFrame({'IntCol': [10000000, 100000000, 1000000000, 10000000000, 100000000000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [7, 8, 9, 10, 11]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [7, 8, 9, 10, 11]))\n def test_case_3(self):\n df = pd.DataFrame({'IntCol': [0, 0, 0, 0, 0]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [-np.inf, -np.inf, -np.inf, -np.inf, -np.inf]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [-np.inf, -np.inf, -np.inf, -np.inf, -np.inf]))\n def test_case_4(self):\n df = pd.DataFrame({'IntCol': [10000000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [7]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [7]))\n def test_case_5(self):\n df = pd.DataFrame({'IntCol': [1, 10, 100, 1000, 10000, 100000]})\n df_transformed = f_598(df)\n self.assertTrue(np.allclose(df_transformed['IntCol'], [0, 1, 2, 3, 4, 5]))\n # Check if JSON file exists\n self.assertTrue(os.path.exists('IntCol.json'))\n # Check the contents of the JSON file\n with open('IntCol.json', 'r') as json_file:\n int_col_data = json.load(json_file)\n self.assertTrue(np.allclose(int_col_data, [0, 1, 2, 3, 4, 5]))", "apis": ["json.dump", 
"numpy.log10"], "libs": ["json", "numpy"], "doc": {"description": ["Given a DataFrame with random values and an 'IntCol' column, transform the 'IntCol' column by a logarithm (base 10) and write it to a `IntCol.json` file as a list. Also return the DataFrame."], "note": [], "params": ["df (DataFrame): A pandas DataFrame with a 'IntCol' column."], "returns": ["df (DataFrame): A pandas DataFrame to describe the transformed data."], "reqs": ["json", "pandas", "numpy", "os"], "raises": [], "example": [">>> df = pd.DataFrame({'IntCol': [10, 100, 1000, 10000, 100000]})", ">>> df_transformed = f_598(df)", ">>> print(df_transformed)", "IntCol", "0 1.0", "1 2.0", "2 3.0", "3 4.0", "4 5.0"]}} +{"task_id": "f_544", "prompt": "import math\nimport yaml\n\ndef f_544(yaml_path, key):\n \"\"\"\n Read a YAML file, apply the cosine to a specific key from the data, and then write the modified data back into the YAML file.\n \n Parameters:\n - yaml_path (str): The path to the YAML file.\n - key (str): The key to take the cosine of.\n \n Returns:\n - data (dict): A dictionary representation of the modified YAML data.\n\n Requirements:\n - math\n - yaml\n \n Example:\n >>> yaml_data = f_544('data.yaml', 'ele')\n \"\"\"", "canonical_solution": " with open(yaml_path, 'r') as file:\n data = yaml.safe_load(file)\n\n if key in data:\n data[key] = math.cos(data[key])\n\n with open(yaml_path, 'w') as file:\n yaml.safe_dump(data, file)\n\n return data", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def base(self, yaml_path, key, contents, expected):\n # Create YAML file\n with open(yaml_path, 'w') as file:\n yaml.safe_dump(contents, file)\n # Run function\n data = f_544(yaml_path, key)\n # Check data\n self.assertEqual(data, expected)\n # Remove YAML file\n os.remove(yaml_path)\n def test_case_1(self):\n self.base('./data.yaml', 'ele', {'ele': 1, 'ale': 2, 'ile': 3}, {'ele': math.cos(1), 'ale': 2, 'ile': 3})\n def test_case_2(self):\n self.base('./y.yaml', 'zzz', 
{'zzz': 1, 'yyy': 2, 'xxx': 3}, {'zzz': math.cos(1), 'yyy': 2, 'xxx': 3})\n def test_case_3(self):\n self.base('./data.yaml', 'ale', {'ele': 1, 'ale': 2, 'ile': 3}, {'ele': 1, 'ale': math.cos(2), 'ile': 3})\n def test_case_4(self):\n self.base('./y.yaml', 'yyy', {'zzz': 1, 'yyy': 2, 'xxx': 3}, {'zzz': 1, 'yyy': math.cos(2), 'xxx': 3})\n def test_case_5(self):\n self.base('./data.yaml', 'ile', {'ele': 1, 'ale': 2, 'ile': 3}, {'ele': 1, 'ale': 2, 'ile': math.cos(3)})", "apis": ["yaml.safe_dump", "math.cos", "yaml.safe_load"], "libs": ["yaml", "math"], "doc": {"description": ["Read a YAML file, apply the cosine to a specific key from the data, and then write the modified data back into the YAML file."], "note": [], "params": ["yaml_path (str): The path to the YAML file.", "key (str): The key to take the cosine of."], "returns": ["data (dict): A dictionary representation of the modified YAML data."], "reqs": ["math", "yaml"], "raises": [], "example": [">>> yaml_data = f_544('data.yaml', 'ele')"]}} +{"task_id": "f_549", "prompt": "import numpy as np\nfrom sklearn.preprocessing import OneHotEncoder\n\ndef f_549(list_of_lists):\n \"\"\"\n Merges a predefined set of lists into a list and one-hot-encodes the elements of the list.\n\n Parameters:\n - list_of_lists (list): The list to be processed.\n\n Returns:\n - one_hot (numpy.array): The one-hot encoding of the merged list.\n\n Requirements:\n - numpy\n - scikit-learn\n\n Example:\n >>> f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],\n [0., 1., 0., 0., 0., 0., 0., 0., 0.],\n [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n [0., 0., 0., 0., 0., 0., 0., 1., 0.],\n [0., 0., 0., 0., 0., 0., 0., 0., 1.]])\n \"\"\"", "canonical_solution": " merged_list = np.array([item for sublist in list_of_lists for item in sublist]).reshape(-1, 1)\n 
encoder = OneHotEncoder(sparse=False)\n one_hot = encoder.fit_transform(merged_list)\n return one_hot", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).shape, (9, 9))\n def test_case_2(self):\n arr = f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.assertTrue(np.all(arr.sum(axis=0) == 1))\n self.assertTrue(np.all(arr.sum(axis=1) == 1))\n self.assertTrue(np.all(arr >= 0))\n def test_case_3(self):\n arr = f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.assertEqual(arr[0, 0], 1)\n self.assertEqual(arr[1, 1], 1)\n self.assertEqual(arr[2, 2], 1)\n self.assertEqual(arr[3, 3], 1)\n self.assertEqual(arr[4, 4], 1)\n self.assertEqual(arr[5, 5], 1)\n self.assertEqual(arr[6, 6], 1)\n self.assertEqual(arr[7, 7], 1)\n self.assertEqual(arr[8, 8], 1)\n \n def test_case_4(self):\n arr = f_549([[1, 1, 1], [2, 2, 2], [3, 3, 3]])\n self.assertEqual(arr[0, 0], 1)\n self.assertEqual(arr[1, 0], 1)\n self.assertEqual(arr[2, 0], 1)\n self.assertEqual(arr[3, 1], 1)\n self.assertEqual(arr[4, 1], 1)\n self.assertEqual(arr[5, 1], 1)\n self.assertEqual(arr[6, 2], 1)\n self.assertEqual(arr[7, 2], 1)\n self.assertEqual(arr[8, 2], 1)\n def test_case_5(self):\n arr = f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.assertEqual(arr[0, 0], 1)\n self.assertEqual(arr[1, 1], 1)\n self.assertEqual(arr[2, 2], 1)\n self.assertEqual(arr[3, 3], 1)\n self.assertEqual(arr[4, 4], 1)\n self.assertEqual(arr[5, 5], 1)\n self.assertEqual(arr[6, 6], 1)\n self.assertEqual(arr[7, 7], 1)\n self.assertEqual(arr[8, 8], 1)", "apis": ["sklearn.preprocessing.OneHotEncoder", "numpy.array"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Merges a predefined set of lists into a list and one-hot-encodes the elements of the list."], "note": [], "params": ["list_of_lists (list): The list to be processed."], "returns": ["one_hot (numpy.array): The one-hot encoding of the merged list."], "reqs": ["numpy", "scikit-learn"], 
"raises": [], "example": [">>> f_549([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", "array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],", "[0., 1., 0., 0., 0., 0., 0., 0., 0.],", "[0., 0., 1., 0., 0., 0., 0., 0., 0.],", "[0., 0., 0., 1., 0., 0., 0., 0., 0.],", "[0., 0., 0., 0., 1., 0., 0., 0., 0.],", "[0., 0., 0., 0., 0., 1., 0., 0., 0.],", "[0., 0., 0., 0., 0., 0., 1., 0., 0.],", "[0., 0., 0., 0., 0., 0., 0., 1., 0.],", "[0., 0., 0., 0., 0., 0., 0., 0., 1.]])"]}} +{"task_id": "f_739", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport random\n\n# Constants\nNUMBERS = list(range(1, 7)) # Adjusting for dice rolls (1 to 6)\n\ndef f_739(rolls, seed=None):\n \"\"\"\n Simulate a number of dice rolls, calculate the frequency of each result, and return both the frequency array and a histogram of the results.\n\n Note:\n The dice rolls have 6 possible outcomes.\n The title of the histogram is \"Histogram of Dice Rolls\".\n The x-axis is labeled \"Dice Value\" and the y-axis is labeled \"Frequency\".\n \n Parameters:\n rolls (int): The number of dice rolls.\n\n Returns:\n tuple: A tuple containing:\n - np.array: A numpy array with the frequency of each outcome.\n - matplotlib.Axes: Axes object representing the histogram.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - random\n\n Examples:\n >>> import random\n >>> random.seed(0)\n >>> outcomes, ax = f_739(10000)\n >>> print(outcomes)\n [1656 1690 1696 1657 1632 1669]\n >>> plt.show()\n >>> random.seed(10)\n >>> outcomes, ax = f_739(100)\n >>> print(outcomes)\n [15 21 17 22 16 9]\n >>> plt.show()\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n \n outcomes = [random.choice(NUMBERS) for _ in range(rolls)]\n frequencies = np.bincount(outcomes, minlength=7)[1:] # Excluding 0 as dice starts from 1\n\n # Creating histogram\n fig, ax = plt.subplots()\n ax.hist(outcomes, bins=np.arange(1, 7+1.5)-0.5, edgecolor='black')\n ax.set_title('Histogram of Dice Rolls')\n ax.set_xlabel('Dice Value')\n 
ax.set_ylabel('Frequency')\n\n return frequencies, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n outcomes, ax = f_739(100, seed=1)\n self.assertEqual(len(outcomes), 6)\n self.assertEqual(sum(outcomes), 100)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n def test_case_2(self):\n outcomes, ax = f_739(0, seed=2)\n self.assertEqual(outcomes.tolist(), [0, 0, 0, 0, 0, 0])\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n def test_case_3(self):\n outcomes, ax = f_739(100000, seed=3)\n self.assertEqual(outcomes.tolist(), [16607, 16689, 16800, 16625, 16640, 16639])\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n \n def test_case_4(self):\n outcomes, ax = f_739(1, seed=4)\n self.assertEqual(outcomes.tolist(), [0, 1, 0, 0, 0, 0])\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')\n \n def test_case_5(self):\n outcomes, ax = f_739(10, seed=5)\n self.assertEqual(sum(outcomes), 10)\n self.assertEqual(ax.get_title(), 'Histogram of Dice Rolls')\n self.assertEqual(ax.get_xlabel(), 'Dice Value')\n self.assertEqual(ax.get_ylabel(), 'Frequency')", "apis": ["numpy.arange", "random.seed", "random.choice", "matplotlib.pyplot.subplots", "numpy.bincount"], "libs": ["random", "numpy", "matplotlib"], "doc": {"description": ["Simulate a number of dice rolls, calculate the frequency of each result, and return both the frequency array and a histogram of the results."], "note": ["The dice rolls have 6 
possible outcomes.", "The title of the histogram is \"Histogram of Dice Rolls\".", "The x-axis is labeled \"Dice Value\" and the y-axis is labeled \"Frequency\"."], "params": ["rolls (int): The number of dice rolls."], "returns": ["tuple: A tuple containing:", "np.array: A numpy array with the frequency of each outcome.", "matplotlib.Axes: Axes object representing the histogram."], "reqs": ["numpy", "matplotlib.pyplot", "random"], "raises": [], "example": ["Examples:", ">>> import random", ">>> random.seed(0)", ">>> outcomes, ax = f_739(10000)", ">>> print(outcomes)", "[1656 1690 1696 1657 1632 1669]", ">>> plt.show()", ">>> random.seed(10)", ">>> outcomes, ax = f_739(100)", ">>> print(outcomes)", "[15 21 17 22 16 9]", ">>> plt.show()"]}} +{"task_id": "f_782", "prompt": "import re\nimport os\nimport glob\n\ndef f_782(dir_path: str) -> list:\n \"\"\"\n Rename all files in the specified directory by removing all special characters,\n punctuation marks, and spaces, using regular expressions. 
The function keeps\n alphanumeric characters and removes the rest.\n\n Requirements:\n - re\n - os\n - glob\n\n Parameters:\n dir_path (str): The path to the directory containing the files to be renamed.\n\n Returns:\n list[str]: A list containing the new names of all files after renaming.\n\n Example:\n >>> f_782('path/to/directory')\n ['file1', 'file2', 'file3']\n >>> f_782('another/directory/path')\n ['anotherFile1', 'anotherFile2']\n \"\"\"", "canonical_solution": " new_names = []\n for file_path in glob.glob(os.path.join(dir_path, '*')):\n base_name = os.path.basename(file_path)\n new_name = re.sub('[^A-Za-z0-9]+', '', base_name)\n new_path = os.path.join(dir_path, new_name)\n os.rename(file_path, new_path)\n new_names.append(new_name)\n return new_names", "test": "import unittest\nfrom pathlib import Path\nimport shutil\nclass TestRenameFiles(unittest.TestCase):\n \n def setUp(self):\n self.temp_dir = Path(\"temp_test_dir\")\n self.temp_dir.mkdir(parents=True, exist_ok=True)\n \n def tearDown(self):\n shutil.rmtree(self.temp_dir)\n \n def test_special_characters_removal(self):\n test_files = [\"file@1.txt\", \"file_#2.txt\", \"file$ 3.txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n \n expected_names = [\"file1txt\", \"file2txt\", \"file3txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))\n \n def test_alphanumeric_names(self):\n test_files = [\"file1.txt\", \"file2.txt\", \"file3.txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n \n expected_names = [\"file1txt\", \"file2txt\", \"file3txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))\n \n def test_empty_directory(self):\n expected_names = []\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(new_file_names, expected_names)\n \n def test_only_special_characters(self):\n test_files = 
[\"@@@.txt\", \"###.txt\", \"$$$ .txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n \n expected_names = [\"txt\", \"txt\", \"txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))\n \n def test_mixed_characters(self):\n test_files = [\"f@ile_1.txt\", \"file# 2.txt\", \"fi$le 3.txt\"]\n for file_name in test_files:\n (self.temp_dir / file_name).touch()\n \n expected_names = [\"file1txt\", \"file2txt\", \"file3txt\"]\n new_file_names = f_782(str(self.temp_dir))\n \n self.assertListEqual(sorted(new_file_names), sorted(expected_names))", "apis": ["re.sub", "os.path", "os.rename", "os.path.join", "os.path.basename", "glob.glob"], "libs": ["re", "glob", "os"], "doc": {"description": ["Rename all files in the specified directory by removing all special characters,", "punctuation marks, and spaces, using regular expressions. The function keeps", "alphanumeric characters and removes the rest."], "note": [], "params": ["dir_path (str): The path to the directory containing the files to be renamed."], "returns": ["list[str]: A list containing the new names of all files after renaming."], "reqs": ["re", "os", "glob"], "raises": [], "example": [">>> f_782('path/to/directory')", "['file1', 'file2', 'file3']", ">>> f_782('another/directory/path')", "['anotherFile1', 'anotherFile2']"]}} +{"task_id": "f_336", "prompt": "import pandas as pd\nfrom sklearn.feature_selection import SelectKBest, f_classif\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n\ndef f_336(df1, df2):\n \"\"\"Perform the feature selection with SelectKBest (k=2) and return a heatmap of the feature correlations.\n\n Parameters:\n - df1 (pd.DataFrame): The dataframe containing features.\n - df2 (pd.DataFrame): The dataframe containing the target variable. 
Must have an 'id' column corresponding to df1.\n\n Returns:\n - tuple: A tuple containing:\n - list: A list of the selected features.\n - Axes: A heatmap showing the correlation between the selected features.\n\n Requirements:\n - pandas\n - sklearn.feature_selection.SelectKBest\n - sklearn.feature_selection.f_classif\n - seaborn\n\n Example:\n >>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})\n >>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})\n >>> selected_features, heatmap = f_336(df1, df2)\n >>> heatmap\n \n >>> selected_features\n ['feature2', 'feature3']\n \"\"\"", "canonical_solution": " # Merge dataframes based on 'id'\n df = pd.merge(df1, df2, on=\"id\")\n\n # Separate features and target\n features = df1.columns.drop(\"id\")\n X = df[features]\n y = df[\"target\"]\n\n # Select top 2 features\n selector = SelectKBest(f_classif, k=2)\n X_new = selector.fit_transform(X, y)\n\n selected_features = [x for x, y in zip(features, selector.get_support()) if y]\n\n # Draw heatmap\n heatmap = sns.heatmap(\n pd.DataFrame(X_new, columns=selected_features).corr(), annot=True\n )\n\n return selected_features, heatmap", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def test_case_1(self):\n # Dataset with clear distinction between features\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4, 5],\n \"feature1\": [5.5, 6.7, 7.8, 8.9, 9.0],\n \"feature2\": [1.1, 2.2, 3.3, 4.4, 5.5],\n \"feature3\": [0.5, 1.5, 2.5, 3.5, 4.5],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3, 4, 5], \"target\": [1, 0, 1, 0, 1]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature1\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_2(self):\n # 
Dataset with features having moderate correlation\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [1.2, 3.4, 5.6],\n \"feature2\": [2.3, 4.5, 6.7],\n \"feature3\": [3.4, 5.6, 7.8],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [4.5, 6.7, 8.9]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_3(self):\n # Dataset with balanced target values\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3, 4],\n \"feature1\": [2.5, 3.5, 4.5, 5.5],\n \"feature2\": [6.6, 7.7, 8.8, 9.9],\n \"feature3\": [10.1, 11.1, 12.1, 13.1],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3, 4], \"target\": [0, 1, 0, 1]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_4(self):\n # Smaller dataset\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2],\n \"feature1\": [3.3, 4.4],\n \"feature2\": [5.5, 6.6],\n \"feature3\": [7.7, 8.8],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2], \"target\": [1, 0]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_5(self):\n # Dataset with different feature correlations\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [10, 20, 30],\n \"feature2\": [40, 50, 60],\n \"feature3\": [70, 80, 90],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [1, 0, 1]})\n # Calling the function and asserting results\n selected_features, ax = f_336(df1, df2)\n self.assertListEqual(selected_features, [\"feature2\", \"feature3\"])\n self.assertIsInstance(ax, 
plt.Axes)\n self.assertTrue(ax.has_data())\n def test_case_6(self):\n # Test handling errors - no \"id\"\n df1 = pd.DataFrame(\n {\n \"feature1\": [10, 20, 30],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [1, 0, 1]})\n with self.assertRaises(KeyError):\n f_336(df1, df2)\n def test_case_7(self):\n # Test handling errors - wrong types\n df1 = pd.DataFrame(\n {\n \"id\": [1, 2, 3],\n \"feature1\": [\"a\", \"b\", 3],\n }\n )\n df2 = pd.DataFrame({\"id\": [1, 2, 3], \"target\": [1, 0, 1]})\n with self.assertRaises(ValueError):\n f_336(df1, df2)", "apis": ["seaborn.heatmap", "sklearn.feature_selection.SelectKBest", "pandas.DataFrame", "pandas.merge"], "libs": ["seaborn", "pandas", "sklearn"], "doc": {"description": ["Perform the feature selection with SelectKBest (k=2) and return a heatmap of the feature correlations."], "note": [], "params": ["df1 (pd.DataFrame): The dataframe containing features.", "df2 (pd.DataFrame): The dataframe containing the target variable. Must have an 'id' column corresponding to df1."], "returns": ["tuple: A tuple containing:", "list: A list of the selected features.", "Axes: A heatmap showing the correlation between the selected features."], "reqs": ["pandas", "sklearn.feature_selection.SelectKBest", "sklearn.feature_selection.f_classif", "seaborn"], "raises": [], "example": [">>> df1 = pd.DataFrame({'id': [1, 2, 3], 'feature1': [1.2, 3.4, 5.6], 'feature2': [2.3, 4.5, 6.7], 'feature3': [3.4, 5.6, 7.8]})", ">>> df2 = pd.DataFrame({'id': [1, 2, 3], 'target': [4.5, 6.7, 8.9]})", ">>> selected_features, heatmap = f_336(df1, df2)", ">>> heatmap", "", ">>> selected_features", "['feature2', 'feature3']"]}} +{"task_id": "f_743", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\n\n# Updated function to handle empty input list\ndef f_743(d):\n \"\"\"\n Scale all values with the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d\" with MinMaxScaler.\n\n Parameters:\n d (list): A list of 
dictionaries.\n\n Returns:\n DataFrame: A pandas DataFrame with scaled values.\n\n Requirements:\n - pandas\n - sklearn.preprocessing.MinMaxScaler\n\n Example usage:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n >>> print(f_743(data))\n x y z\n 0 0.0 0.642857 0.0\n 1 1.0 1.000000 0.5\n 2 0.5 0.000000 1.0\n\n >>> data = [{'x': -1, 'y': 0, 'z': 5}, {'x': 3, 'y': -15, 'z': 0}, {'x': 0, 'y': 1, 'z': -7}]\n >>> print(f_743(data))\n x y z\n 0 0.00 0.9375 1.000000\n 1 1.00 0.0000 0.583333\n 2 0.25 1.0000 0.000000\n \"\"\"", "canonical_solution": " if not d: # Check if the input list is empty\n return pd.DataFrame(columns=['x', 'y', 'z']) # Return an empty DataFrame with specified columns\n \n df = pd.DataFrame(d)\n scaler = MinMaxScaler()\n scaled_df = pd.DataFrame(scaler.fit_transform(df[['x', 'y', 'z']]), columns=['x', 'y', 'z'])\n\n return scaled_df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, 1.0, 0.5], 'y': [0.642857, 1.0, 0.0], 'z': [0.0, 0.5, 1.0]})\n pd.testing.assert_frame_equal(result, expected_df)\n \n def test_case_2(self):\n data = [{'x': -1, 'y': 0, 'z': 5}, {'x': 3, 'y': -15, 'z': 0}, {'x': 0, 'y': 1, 'z': -7}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, 1.0, 0.25], 'y': [0.9375, 0.0, 1.0], 'z': [1.0, 0.583333, 0.0]})\n pd.testing.assert_frame_equal(result, expected_df)\n \n def test_case_3(self):\n data = []\n result = f_743(data)\n expected_df = pd.DataFrame(columns=['x', 'y', 'z'])\n pd.testing.assert_frame_equal(result, expected_df)\n \n def test_case_4(self):\n data = [{'x': 1}, {'y': 2}, {'z': 3}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, None, None], 'y': [None, 0.0, None], 'z': [None, None, 0.0]})\n pd.testing.assert_frame_equal(result, expected_df)\n \n def 
test_case_5(self):\n data = [{'x': 1, 'y': 2}, {'x': 3, 'z': 4}]\n result = f_743(data)\n expected_df = pd.DataFrame({'x': [0.0, 1.0], 'y': [0.0, None], 'z': [None, 0.0]})\n pd.testing.assert_frame_equal(result, expected_df)", "apis": ["pandas.DataFrame", "sklearn.preprocessing.MinMaxScaler"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Scale all values with the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d\" with MinMaxScaler.", "Example usage:", ">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]", ">>> print(f_743(data))", "x y z", "0 0.0 0.642857 0.0", "1 1.0 1.000000 0.5", "2 0.5 0.000000 1.0", ">>> data = [{'x': -1, 'y': 0, 'z': 5}, {'x': 3, 'y': -15, 'z': 0}, {'x': 0, 'y': 1, 'z': -7}]", ">>> print(f_743(data))", "x y z", "0 0.00 0.9375 1.000000", "1 1.00 0.0000 0.583333", "2 0.25 1.0000 0.000000"], "note": [], "params": ["d (list): A list of dictionaries."], "returns": ["DataFrame: A pandas DataFrame with scaled values."], "reqs": ["pandas", "sklearn.preprocessing.MinMaxScaler"], "raises": [], "example": []}} +{"task_id": "f_409", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_409(data_list):\n \"\"\"\n Visualizes the scores of students over multiple tests using a line plot.\n\n The function takes in a list of dictionaries. Each dictionary contains the name of a student (key)\n and their score (value). It combines these dictionaries into a pandas DataFrame and plots a line graph\n of student scores over tests, where the x-axis represents the test number and the y-axis represents the score.\n Each student's scores are plotted as separate lines. 
Missing scores are handled by not plotting\n those specific data points, allowing for discontinuous lines where data is missing.\n\n Parameters:\n - data_list (list of dict): A list of dictionaries with student names as keys and their scores as values.\n\n Returns:\n - ax (matplotlib.axes._axes.Axes): The Axes object with the plotted data.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> data = [{'John': 5, 'Jane': 10}, {'John': 6, 'Jane': 8}, {'John': 5, 'Jane': 9}]\n >>> ax = f_409(data)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-0.25, 0, '\u22120.25'), Text(0.0, 0, '0.00'), Text(0.25, 0, '0.25'), Text(0.5, 0, '0.50'), Text(0.75, 0, '0.75'), Text(1.0, 0, '1.00'), Text(1.25, 0, '1.25'), Text(1.5, 0, '1.50'), Text(1.75, 0, '1.75'), Text(2.0, 0, '2.00'), Text(2.25, 0, '2.25')]\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_list)\n fig, ax = plt.subplots()\n for column in df:\n ax.plot(df[column], label=column)\n ax.set_title(\"Student Scores over Tests\")\n ax.set_xlabel(\"Test Number\")\n ax.set_ylabel(\"Score\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n data = [\n {\"John\": 5, \"Jane\": 10, \"Joe\": 7},\n {\"John\": 6, \"Jane\": 8, \"Joe\": 10},\n {\"John\": 5, \"Jane\": 9, \"Joe\": 8},\n {\"John\": 7, \"Jane\": 10, \"Joe\": 9},\n ]\n self.validate_plot(data)\n def test_case_2(self):\n data = [{\"John\": 3}, {\"John\": 4}, {\"John\": 5}, {\"John\": 6}]\n self.validate_plot(data)\n def test_case_3(self):\n data = [\n {\"John\": 3, \"Jane\": 2},\n {\"John\": 4, \"Jane\": 3},\n {\"John\": 5, \"Jane\": 4},\n {\"John\": 6, \"Jane\": 5},\n ]\n self.validate_plot(data)\n def test_case_4(self):\n data = [\n {\"John\": 10, \"Jane\": 20, \"Joe\": 15, \"Jack\": 25},\n {\"John\": 12, \"Jane\": 18, \"Joe\": 14, \"Jack\": 24},\n {\"John\": 11, \"Jane\": 19, \"Joe\": 13, \"Jack\": 23},\n {\"John\": 
13, \"Jane\": 21, \"Joe\": 16, \"Jack\": 22},\n ]\n self.validate_plot(data)\n def test_case_5(self):\n data = [\n {\"John\": 7, \"Jane\": 8},\n {\"John\": 8, \"Jane\": 7},\n {\"John\": 7, \"Jane\": 8},\n {\"John\": 8, \"Jane\": 7},\n ]\n self.validate_plot(data)\n def test_case_6(self):\n data = []\n self.validate_plot(data)\n def test_case_7(self):\n # Floats\n data = [{\"John\": 5.5, \"Jane\": 10.1}, {\"John\": 6.75, \"Jane\": 8.25}]\n self.validate_plot(data)\n def test_case_8(self):\n # Missing scores\n data = [{\"John\": 5, \"Jane\": 10}, {\"Jane\": 8, \"Joe\": 7}, {\"John\": 6}]\n self.validate_plot(data)\n def validate_plot(self, data):\n ax = f_409(data)\n self.assertIsInstance(ax, plt.Axes)\n df = pd.DataFrame(data)\n for idx, column in enumerate(df):\n plotted_data_y = ax.lines[idx].get_ydata()\n expected_data_y = df[column].values.astype(float)\n # Handle float comparisons\n np.testing.assert_allclose(\n plotted_data_y, expected_data_y, rtol=1e-5, atol=1e-8, equal_nan=True\n )\n plotted_data_x = ax.lines[idx].get_xdata().astype(int)\n expected_data_x = np.arange(len(df[column].values))\n self.assertTrue(\n np.array_equal(plotted_data_x, expected_data_x),\n msg=f\"X-data Mismatch for {column}. Plotted: {plotted_data_x}, Expected: {expected_data_x}\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Visualizes the scores of students over multiple tests using a line plot.", "The function takes in a list of dictionaries. Each dictionary contains the name of a student (key)", "and their score (value). It combines these dictionaries into a pandas DataFrame and plots a line graph", "of student scores over tests, where the x-axis represents the test number and the y-axis represents the score.", "Each student's scores are plotted as separate lines. 
Missing scores are handled by not plotting", "those specific data points, allowing for discontinuous lines where data is missing."], "note": [], "params": ["data_list (list of dict): A list of dictionaries with student names as keys and their scores as values."], "returns": ["ax (matplotlib.axes._axes.Axes): The Axes object with the plotted data."], "reqs": ["pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [{'John': 5, 'Jane': 10}, {'John': 6, 'Jane': 8}, {'John': 5, 'Jane': 9}]", ">>> ax = f_409(data)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-0.25, 0, '\u22120.25'), Text(0.0, 0, '0.00'), Text(0.25, 0, '0.25'), Text(0.5, 0, '0.50'), Text(0.75, 0, '0.75'), Text(1.0, 0, '1.00'), Text(1.25, 0, '1.25'), Text(1.5, 0, '1.50'), Text(1.75, 0, '1.75'), Text(2.0, 0, '2.00'), Text(2.25, 0, '2.25')]"]}} +{"task_id": "f_857", "prompt": "import requests\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_857(api_url):\n \"\"\"\n Fetches data from a specified API, processes the JSON response, converts it into a pandas DataFrame,\n and plots the data using matplotlib.\n If the data is empty, no plot is generated. 
If the API request fails, it raises an HTTPError.\n The function also checks if the provided API URL is a string.\n\n Parameters:\n - api_url (str): The URL of the API to fetch data from.\n\n Returns:\n - DataFrame: A pandas DataFrame with the parsed data from the API.\n - Axes or None: A matplotlib Axes object representing the plot of the data, or None if the data is empty.\n\n Raises:\n - HTTPError: If the API request fails due to issues like network problems, invalid response, etc.\n - TypeError: If the `api_url` is not a string.\n\n Requirements:\n - requests\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> df, plot = f_857(\"https://api.example.com/data\")\n >>> df.head()\n >>> if plot:\n >>> plot.show()\n \"\"\"", "canonical_solution": " # Send the GET request and handle API failure\n if not isinstance(api_url, str):\n raise TypeError(\"api_url must be a string\")\n\n response = requests.get(api_url, timeout=5)\n response.raise_for_status()\n\n # Parse the JSON response and convert it to a pandas DataFrame\n data = response.json()\n df = pd.DataFrame(data)\n\n # Generate a plot if the DataFrame is not empty\n plot = df.plot() if not df.empty else None\n\n return df, plot", "test": "import unittest\nfrom unittest.mock import patch, Mock\nimport pandas as pd\nimport matplotlib.pyplot as plt\nAPI_URL = \"https://api.example.com/data\"\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n @patch(\"requests.get\")\n def test_successful_api_call_with_data(self, mock_get):\n \"\"\"Test the function with a successful API call returning non-empty data.\"\"\"\n mock_get.return_value = Mock(status_code=200, json=lambda: [{\"a\": 1, \"b\": 2}])\n df, plot = f_857(\"http://example.com/api\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIsInstance(plot, plt.Axes)\n @patch(\"requests.get\")\n def test_successful_api_call_with_empty_data(self, mock_get):\n \"\"\"Test the function with a successful API call returning empty 
data.\"\"\"\n mock_get.return_value = Mock(status_code=200, json=lambda: [])\n df, plot = f_857(\"http://example.com/api\")\n self.assertIsInstance(df, pd.DataFrame)\n self.assertTrue(df.empty)\n self.assertIsNone(plot)\n @patch(\"requests.get\")\n def test_api_call_with_invalid_json(self, mock_get):\n \"\"\"Test the function with an API call returning invalid JSON.\"\"\"\n mock_get.return_value = Mock(\n status_code=200, json=lambda: Exception(\"Invalid JSON\")\n )\n with self.assertRaises(Exception):\n f_857(\"http://example.com/api\")\n @patch(\"requests.get\")\n def test_api_call_with_http_error(self, mock_get):\n \"\"\"Test the function with an API call that raises an HTTP error.\"\"\"\n mock_get.side_effect = requests.HTTPError()\n with self.assertRaises(requests.HTTPError):\n f_857(\"http://example.com/api\")\n def test_incorrect_url_type(self):\n \"\"\"Test the function with an incorrect type for the URL.\"\"\"\n with self.assertRaises(TypeError):\n f_857(123)\n def tearDown(self):\n plt.close()", "apis": ["pandas.DataFrame", "requests.get"], "libs": ["pandas", "requests"], "doc": {"description": ["Fetches data from a specified API, processes the JSON response, converts it into a pandas DataFrame,", "and plots the data using matplotlib.", "If the data is empty, no plot is generated. 
If the API request fails, it raises an HTTPError.", "The function also checks if the provided API URL is a string."], "note": [], "params": ["api_url (str): The URL of the API to fetch data from."], "returns": ["DataFrame: A pandas DataFrame with the parsed data from the API.", "Axes or None: A matplotlib Axes object representing the plot of the data, or None if the data is empty."], "reqs": ["requests", "pandas", "matplotlib.pyplot"], "raises": ["HTTPError: If the API request fails due to issues like network problems, invalid response, etc.", "TypeError: If the `api_url` is not a string."], "example": [">>> df, plot = f_857(\"https://api.example.com/data\")", ">>> df.head()", ">>> if plot:", ">>> plot.show()"]}} +{"task_id": "f_546", "prompt": "from collections import Counter\nfrom itertools import chain\n\ndef f_546(list_of_lists):\n \"\"\"\n Merge all sublists from a list of lists into a list and return a count of the elements.\n \n Parameters:\n - list_of_lists (list): The list to be processed.\n\n Returns:\n - collections.Counter: Counter object with the counts of the elements in the merged list.\n\n Requirements:\n - itertools\n - collections\n \n Example:\n >>> f_546([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1})\n \"\"\"", "canonical_solution": " merged_list = list(chain.from_iterable(list_of_lists))\n return Counter(merged_list)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}))\n def test_case_2(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 2, 2: 2, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}))\n def test_case_3(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2], [1, 2, 3, 4, 5, 6, 7, 8, 9]]\n 
self.assertEqual(f_546(list_of_lists), Counter({1: 3, 2: 3, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2}))\n def test_case_4(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2], [1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 4, 2: 4, 3: 3, 4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2}))\n def test_case_5(self):\n list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2], [1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]]\n self.assertEqual(f_546(list_of_lists), Counter({1: 5, 2: 5, 3: 4, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3, 9: 3}))", "apis": ["collections.Counter", "itertools.chain.from_iterable"], "libs": ["itertools", "collections"], "doc": {"description": ["Merge all sublists from a list of lists into a list and return a count of the elements."], "note": [], "params": ["list_of_lists (list): The list to be processed."], "returns": ["collections.Counter: Counter object with the counts of the elements in the merged list."], "reqs": ["itertools", "collections"], "raises": [], "example": [">>> f_546([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", "Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1})"]}} +{"task_id": "f_767", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_767(data_size=1000, column_names=['A', 'B', 'C', 'D', 'E'], seed=0):\n \"\"\"\n Generate a Pandas DataFrame with random numeric values between 1 and 100, inclusive, and replace all occurrences of values less than 10 with -1.\n \n Requirements:\n - pandas\n - numpy\n \n Parameters:\n - data_size (int, optional): The number of rows in the DataFrame. Defaults to 1000.\n - column_names (list of str, optional): Names of the DataFrame columns. 
Defaults to ['A', 'B', 'C', 'D', 'E'].\n\n Returns:\n - DataFrame: The modified Pandas DataFrame.\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n df = pd.DataFrame(np.random.randint(1, 101, size=(data_size, len(column_names))), columns=column_names)\n df[df < 10] = -1 # Correctly replace values less than 10 with -1\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_default_parameters(self):\n df = f_767(seed=42)\n self.assertEqual(df.shape, (1000, 5))\n # Check that there are no values < 10 except -1\n condition = ((df >= 10) | (df == -1)).all().all()\n self.assertTrue(condition, \"DataFrame contains values less than 10 that were not replaced with -1\")\n def test_custom_data_size_and_columns(self):\n df = f_767(data_size=10, column_names=['X', 'Y'], seed=55)\n self.assertEqual(df.shape, (10, 2))\n # Check that there are no values < 10 except -1\n condition = ((df >= 10) | (df == -1)).all().all()\n self.assertTrue(condition, \"DataFrame contains values less than 10 that were not replaced with -1\")\n def test_correct_replacement_of_values(self):\n df = f_767(data_size=100, seed=0)\n self.assertTrue(((df >= 10) | (df == -1)).all().all(), \"Not all values less than 10 were replaced with -1\")\n \n def test_correct_dataframe_dimensions(self):\n rows, columns = 50, 3\n df = f_767(data_size=rows, column_names=['P', 'Q', 'R'], seed=1)\n self.assertEqual(df.shape, (rows, columns), \"DataFrame dimensions are incorrect\")\n \n def test_with_minimum_data_size(self):\n df = f_767(data_size=1, column_names=['Single'], seed=2)\n self.assertEqual(df.shape, (1, 1), \"DataFrame does not handle minimum data size correctly\")", "apis": ["pandas.DataFrame", "numpy.random", "numpy.random.randint", "numpy.random.seed"], "libs": ["numpy", "pandas"], "doc": {"description": ["Generate a Pandas DataFrame with random numeric values between 1 and 100, inclusive, and replace all occurrences of values less than 10 with -1."], "note": [], "params": 
["data_size (int, optional): The number of rows in the DataFrame. Defaults to 1000.", "column_names (list of str, optional): Names of the DataFrame columns. Defaults to ['A', 'B', 'C', 'D', 'E']."], "returns": ["DataFrame: The modified Pandas DataFrame."], "reqs": ["pandas", "numpy"], "raises": [], "example": []}} +{"task_id": "f_375", "prompt": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_375(N=100, CATEGORIES=[\"A\", \"B\", \"C\", \"D\", \"E\"], seed=42):\n \"\"\"\n Create a DataFrame with a given number of rows (N) and 3 columns: \"x\" and \"y\" with random values,\n and \"category\" with random categories from a given CATEGORIES list. Each category is guaranteed to\n appear at least once if N is greater than or equal to the number of categories, otherwise it is\n randomly sampled without replacement from CATEGORIES. Finally, draw a scatter plot of \"x\" vs \"y,\"\n colored by \"category\".\n\n Parameters:\n - N (int, optional): Number of rows for the DataFrame. Defaults to 100.\n - CATEGORIES (list, optional): List of categories. Defaults to ['A', 'B', 'C', 'D', 'E'].\n - seed (int, optional): Random seed for reproducibility. 
Defaults to 42.\n\n Returns:\n tuple: A tuple containing:\n - DataFrame: The generated DataFrame.\n - Axes: The Axes object of the scatter plot.\n\n Requirements:\n - numpy\n - pandas\n - matplotlib.pyplot\n\n Example:\n >>> df, ax = f_375()\n >>> df.head()\n x y category\n 0 0.239562 0.385098 C\n 1 0.144895 0.851137 D\n 2 0.489453 0.316922 C\n 3 0.985650 0.169493 E\n 4 0.242055 0.556801 A\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " np.random.seed(seed)\n\n if N < len(CATEGORIES):\n all_categories = np.random.choice(CATEGORIES, N, replace=False)\n else:\n guaranteed_categories = np.array(CATEGORIES)\n remaining_categories = np.random.choice(CATEGORIES, N - len(CATEGORIES))\n all_categories = np.concatenate([guaranteed_categories, remaining_categories])\n np.random.shuffle(all_categories)\n\n df = pd.DataFrame(\n {\"x\": np.random.rand(N), \"y\": np.random.rand(N), \"category\": all_categories}\n )\n\n fig, ax = plt.subplots()\n for category in CATEGORIES:\n ax.scatter(\n df[df[\"category\"] == category][\"x\"],\n df[df[\"category\"] == category][\"y\"],\n label=category,\n )\n\n return df, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test default parameter\n df, ax = f_375()\n self.assertEqual(df.shape, (100, 3))\n self.assertSetEqual(set(df[\"category\"]), {\"A\", \"B\", \"C\", \"D\", \"E\"})\n self.assertListEqual(list(df.columns), [\"x\", \"y\", \"category\"])\n self.assertTrue(df[\"x\"].between(0, 1).all())\n self.assertTrue(df[\"y\"].between(0, 1).all())\n self.assertIsInstance(ax, plt.Axes)\n def test_case_2(self):\n # Test custom parameters\n df, ax = f_375(N=50, CATEGORIES=[\"X\", \"Y\"])\n self.assertEqual(df.shape, (50, 3))\n self.assertSetEqual(set(df[\"category\"]), {\"X\", \"Y\"})\n self.assertListEqual(list(df.columns), [\"x\", \"y\", \"category\"])\n self.assertTrue(df[\"x\"].between(0, 1).all())\n self.assertTrue(df[\"y\"].between(0, 
1).all())\n self.assertIsInstance(ax, plt.Axes)\n def test_case_3(self):\n # Test N specifically\n for N in [5, 10, 50, 200]:\n df, _ = f_375(N=N)\n self.assertEqual(df.shape, (N, 3))\n def test_case_4(self):\n # Test categories specifically\n for C in [[\"APPLE\", \"BANANA\"], [\"carrot\", \"dragonfruit\", \"eggplant\"], [\"F\"]]:\n df, _ = f_375(CATEGORIES=C)\n self.assertSetEqual(set(df[\"category\"]), set(C))\n def test_case_5(self):\n # Test random seed\n df1, _ = f_375(seed=0)\n df2, _ = f_375(seed=0)\n df3, _ = f_375(seed=1)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertFalse(df1.equals(df3))\n def test_case_6(self):\n # Test handling empty dataframe\n df, _ = f_375(N=0, CATEGORIES=[])\n self.assertEqual(df.shape, (0, 3))\n self.assertListEqual(list(df[\"category\"]), [])\n def test_case_7(self):\n # Test handing more categories than data points\n df, _ = f_375(N=3, CATEGORIES=[\"A\", \"B\", \"C\", \"D\"])\n self.assertEqual(len(df), 3)\n self.assertEqual(len(set(df[\"category\"])), 3)\n def test_case_8(self):\n # Test single category\n df, _ = f_375(N=50, CATEGORIES=[\"X\"])\n self.assertTrue((df[\"category\"] == \"X\").all())\n def test_case_9(self):\n # Test other category types\n df, _ = f_375(N=50, CATEGORIES=[1, 2, 3])\n self.assertSetEqual(set(df[\"category\"]), {1, 2, 3})\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random.shuffle", "pandas.DataFrame", "numpy.random", "numpy.array", "numpy.random.rand", "matplotlib.pyplot.subplots", "numpy.concatenate", "numpy.random.choice", "numpy.random.seed"], "libs": ["pandas", "numpy", "matplotlib"], "doc": {"description": ["Create a DataFrame with a given number of rows (N) and 3 columns: \"x\" and \"y\" with random values,", "and \"category\" with random categories from a given CATEGORIES list. Each category is guaranteed to", "appear at least once if N is greater than or equal to the number of categories, otherwise it is", "randomly sampled without replacement from CATEGORIES. 
Finally, draw a scatter plot of \"x\" vs \"y,\"", "colored by \"category\"."], "note": [], "params": ["N (int, optional): Number of rows for the DataFrame. Defaults to 100.", "CATEGORIES (list, optional): List of categories. Defaults to ['A', 'B', 'C', 'D', 'E'].", "seed (int, optional): Random seed for reproducibility. Defaults to 42."], "returns": ["tuple: A tuple containing:", "DataFrame: The generated DataFrame.", "Axes: The Axes object of the scatter plot."], "reqs": ["numpy", "pandas", "matplotlib.pyplot"], "raises": [], "example": [">>> df, ax = f_375()", ">>> df.head()", "x y category", "0 0.239562 0.385098 C", "1 0.144895 0.851137 D", "2 0.489453 0.316922 C", "3 0.985650 0.169493 E", "4 0.242055 0.556801 A", ">>> type(ax)", ""]}} +{"task_id": "f_747", "prompt": "import os\nimport glob\nimport csv\n\ndef f_747(directory_path, file_extension='.csv'):\n \"\"\"\n Reads all files with a specified extension in a given directory and returns their data in a dictionary.\n\n Functionality:\n - Reads all files with the specified extension in the given directory.\n - Uses the filename without the extension as a key in the output dictionary.\n - The value for each key is a list of rows from the file, where each row is represented as a list of values.\n\n Input:\n - directory_path (str): The path to the directory containing the files.\n - file_extension (str, optional): The file extension to look for. 
Default is '.csv'.\n\n Output:\n - Returns a dictionary where each key is the filename (without extension) and the value is a list of rows from the file.\n\n Requirements:\n - os\n - glob\n - csv\n\n Example:\n >>> data = f_747('/home/user/data')\n >>> print(data['file1'])\n [['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']]\n \n >>> data = f_747('/home/user/data', '.txt')\n >>> print(data)\n {}\n \"\"\"", "canonical_solution": " data = {}\n\n for file in glob.glob(os.path.join(directory_path, '*' + file_extension)):\n filename = os.path.splitext(os.path.basename(file))[0]\n with open(file, 'r') as f:\n reader = csv.reader(f)\n data[filename] = list(reader)\n\n return data", "test": "import unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # create a directory with test files\n os.mkdir('test_1')\n with open('test_1/file1.csv', 'w', newline='') as f:\n writer = csv.writer(f)\n writer.writerows([['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']])\n os.mkdir('test_2')\n with open('test_2/file2.csv', 'w', newline='') as f:\n writer = csv.writer(f)\n writer.writerows([['name', 'age'], ['Alice', '30'], ['Bob', '40']])\n os.mkdir('test_5')\n with open('test_5/file3.csv', 'w', newline='') as f:\n writer = csv.writer(f)\n writer.writerows([['subject', 'marks'], ['Math', '90'], ['Science', '85']])\n def tearDown(self):\n # remove the test directories\n shutil.rmtree('test_1')\n shutil.rmtree('test_2')\n shutil.rmtree('test_5')\n \n def test_case_1(self):\n # This test assumes the existence of a directory named 'f_747_data_wenhao' with a CSV file 'file1.csv'\n data = f_747('test_1')\n self.assertIsInstance(data, dict)\n self.assertIn('file1', data)\n self.assertEqual(data['file1'], [['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']])\n def test_case_2(self):\n # This test checks explicit file_extension input\n data = f_747('test_2', '.csv')\n 
self.assertIsInstance(data, dict)\n self.assertIn('file2', data)\n self.assertEqual(data['file2'], [['name', 'age'], ['Alice', '30'], ['Bob', '40']])\n def test_case_3(self):\n # This test checks for a non-existent file extension, expecting an empty dictionary\n data = f_747('test_3', '.txt')\n self.assertIsInstance(data, dict)\n self.assertEqual(len(data), 0)\n def test_case_4(self):\n # This test checks for a non-existent directory, expecting an empty dictionary\n data = f_747('/nonexistent/directory')\n self.assertIsInstance(data, dict)\n self.assertEqual(len(data), 0)\n def test_case_5(self):\n # This test checks another file's presence and content in the dictionary\n data = f_747('test_5')\n self.assertIsInstance(data, dict)\n self.assertIn('file3', data)\n self.assertEqual(data['file3'], [['subject', 'marks'], ['Math', '90'], ['Science', '85']])", "apis": ["os.path.splitext", "os.path", "csv.reader", "os.path.join", "os.path.basename", "glob.glob"], "libs": ["os", "glob", "csv"], "doc": {"description": ["Reads all files with a specified extension in a given directory and returns their data in a dictionary.", "Functionality:", "- Reads all files with the specified extension in the given directory.", "- Uses the filename without the extension as a key in the output dictionary.", "- The value for each key is a list of rows from the file, where each row is represented as a list of values.", "Input:", "- directory_path (str): The path to the directory containing the files.", "- file_extension (str, optional): The file extension to look for. 
Default is '.csv'.", "Output:", "- Returns a dictionary where each key is the filename (without extension) and the value is a list of rows from the file.", ">>> data = f_747('/home/user/data', '.txt')", ">>> print(data)", "{}"], "note": [], "params": [], "returns": [], "reqs": ["os", "glob", "csv"], "raises": [], "example": [">>> data = f_747('/home/user/data')", ">>> print(data['file1'])", "[['header1', 'header2'], ['row1_col1', 'row1_col2'], ['row2_col1', 'row2_col2']]"]}} {"task_id": "f_357", "prompt": "import numpy as np\nfrom sklearn import datasets\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\n\n\ndef f_357(n_samples=100, n_features=10, random_seed=None):\n \"\"\"\n Generate synthetic data using a simple regression model, fit a linear regression model to the data,\n and return the predicted values along with the coefficients and intercept of the model.\n\n Parameters:\n - n_samples (int): The number of samples for the synthetic data. Default is 100.\n - n_features (int): The number of features for the synthetic data. Default is 10.\n - random_seed (int, optional): The seed for reproducibility. 
Default is None.\n\n Returns:\n - tuple: A tuple containing:\n - predictions (numpy.ndarray): The predicted values of the test set.\n - coefficients (numpy.ndarray): Coefficients of the linear regression model.\n - intercept (float): Intercept of the linear regression model.\n - mse (float): Mean squared error of the model predictions.\n\n Requirements:\n - numpy\n - sklearn.datasets.make_regression\n - sklearn.model_selection.train_test_split\n - sklearn.linear_model.LinearRegression\n \n Example:\n >>> predictions, coefficients, intercept, mse = f_357(100, 5, random_seed=42)\n >>> predictions[:3]\n array([ 180.79207843, -295.0210232 , 118.23799221])\n >>> round(mse, 4)\n 0.0113\n \"\"\"", "canonical_solution": " # Generate synthetic data\n X, y = datasets.make_regression(\n n_samples=n_samples, n_features=n_features, noise=0.1, random_state=random_seed\n )\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=random_seed\n )\n\n # Fit a linear regression model\n model = LinearRegression()\n model.fit(X_train, y_train)\n\n # Make predictions on the test set\n predictions = model.predict(X_test)\n coefficients = model.coef_\n intercept = model.intercept_\n\n mse = np.mean((predictions - y_test) ** 2)\n return predictions, coefficients, intercept, mse", "test": "import unittest\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import datasets\nfrom numpy.testing import assert_array_equal\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def generate_data(self, n_samples, n_features, random_seed=None):\n # Generate data for testing\n X, y = datasets.make_regression(\n n_samples=n_samples,\n n_features=n_features,\n noise=0.1,\n random_state=random_seed,\n )\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=random_seed\n )\n return X_train, X_test, y_train, y_test\n def test_case_1(self):\n # Basic test for different 
inputs\n random_seed = 1\n for n_samples, n_features in [\n [100, 5],\n [500, 8],\n [1000, 10],\n [5000, 15],\n [10000, 20],\n ]:\n predictions, _, _, mse = f_357(n_samples, n_features, random_seed=random_seed)\n _, _, _, y = self.generate_data(\n n_samples, n_features, random_seed=random_seed\n )\n self.assertEqual(mse, mean_squared_error(y, predictions))\n def test_case_2(self):\n # Test default parameters\n predictions, coefficients, intercept, mse = f_357(random_seed=42)\n self.assertEqual(\n predictions.shape[0], 20\n ) # Default split leaves 20% of 100 samples for testing\n self.assertEqual(coefficients.shape[0], 10) # Default number of features\n self.assertIsInstance(intercept, float)\n _, _, _, y = self.generate_data(\n 100, 10, 42\n )\n self.assertEqual(mse, mean_squared_error(y, predictions))\n def test_case_3(self):\n # Test different random seeds for reproducibility\n _, coefficients_1, intercept_1, mse_1 = f_357(random_seed=1)\n _, coefficients_2, intercept_2, mse_2 = f_357(random_seed=2)\n with self.assertRaises(AssertionError):\n assert_array_equal(coefficients_1, coefficients_2)\n self.assertEqual(intercept_1, intercept_2)\n \n def test_case_4(self):\n # Test zero and negative samples and features\n with self.assertRaises(ValueError):\n f_357(n_samples=0, n_features=10)\n with self.assertRaises(ValueError):\n f_357(n_samples=100, n_features=0)\n with self.assertRaises(ValueError):\n f_357(n_samples=-100, n_features=10)\n with self.assertRaises(ValueError):\n f_357(n_samples=100, n_features=-10)\n def test_case_5(self):\n # Test extreme values for parameters\n predictions, _, _, mse = f_357(n_samples=100000, n_features=100, random_seed=42)\n self.assertEqual(\n predictions.shape[0], 20000\n ) # 20% of 100000 samples for testing\n self.assertAlmostEqual(mse, 0.010142327812255192, places=4)\n \n def test_case_6(self):\n # Test output shapes\n predictions, coefficients, _, mse = f_357(\n n_samples=100, n_features=5, random_seed=42\n )\n 
self.assertEqual(predictions.shape[0], 20)\n self.assertEqual(coefficients.shape[0], 5)\n def test_case_7(self):\n # Test output types\n predictions, coefficients, intercept, mse = f_357()\n self.assertIsInstance(predictions, np.ndarray)\n self.assertIsInstance(coefficients, np.ndarray)\n self.assertIsInstance(intercept, float)\n self.assertIsInstance(mse, float)\n \n def test_case_8(self):\n # Test determinism with the same random seed\n predictions_1, _, _, mse_1 = f_357(random_seed=42)\n predictions_2, _, _, mse_2 = f_357(random_seed=42)\n assert_array_equal(predictions_1, predictions_2)\n self.assertEqual(mse_1, mse_2)\n \n def test_case_9(self):\n # Test without random seed (non-deterministic outcomes)\n predictions_1, _, _, _ = f_357()\n predictions_2, _, _, _ = f_357()\n with self.assertRaises(AssertionError):\n assert_array_equal(predictions_1, predictions_2)", "apis": ["sklearn.model_selection.train_test_split", "sklearn.datasets.make_regression", "sklearn.linear_model.LinearRegression", "numpy.mean"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Generate synthetic data using a simple regression model, fit a linear regression model to the data,", "and return the predicted values along with the coefficients and intercept of the model."], "note": [], "params": ["n_samples (int): The number of samples for the synthetic data. Default is 100.", "n_features (int): The number of features for the synthetic data. Default is 10.", "random_seed (int, optional): The seed for reproducibility. 
Default is None."], "returns": ["tuple: A tuple containing:", "predictions (numpy.ndarray): The predicted values of the test set.", "coefficients (numpy.ndarray): Coefficients of the linear regression model.", "intercept (float): Intercept of the linear regression model.", "mse (float): Mean squared error of the model predictions."], "reqs": ["numpy", "sklearn.datasets.make_regression", "sklearn.model_selection.train_test_split", "sklearn.linear_model.LinearRegression"], "raises": [], "example": [">>> predictions, coefficients, intercept, mse = f_357(100, 5, random_seed=42)", ">>> predictions[:3]", "array([ 180.79207843, -295.0210232 , 118.23799221])", ">>> round(mse, 4)", "0.0113"]}} -{"task_id": "f_768", "prompt": "import pandas as pd\nimport sqlite3\nimport os\n\ndef f_768(db_path: str, table_name: str, column_name: str) -> pd.DataFrame:\n \"\"\"\n Loads data from an SQLite database into a Pandas DataFrame and performs a string replacement operation\n on a specified column. Specifically, replaces all occurrences of the newline character '\\n' with the HTML line\n break tag '
'.\n \n Requirements:\n - pandas\n - sqlite3\n - os\n \n Parameters:\n - db_path (str): The path to the SQLite database file.\n - table_name (str): The name of the table from which to load data.\n - column_name (str): The name of the column in which to perform string replacement.\n \n Returns:\n pd.DataFrame: The modified DataFrame with replaced strings in the specified column.\n\n Examples:\n >>> df = f_768('./data.db', 'messages', 'content')\n >>> df.loc[0, 'content'] # Assuming the first row originally contained \"Hello\\nWorld\"\n 'Hello
World'\n >>> df = f_768('./another_data.db', 'comments', 'text')\n >>> df.loc[1, 'text'] # Assuming the second row originally contained \"Good\\nMorning\"\n 'Good
Morning'\n \"\"\"", "canonical_solution": " try:\n conn = sqlite3.connect(db_path)\n df = pd.read_sql_query(f\"SELECT * FROM {table_name}\", conn)\n df[column_name] = df[column_name].replace({'\\n': '
'}, regex=True)\n finally:\n conn.close()\n return df", "test": "def create_mock_db(db_path: str, table_name: str, column_name: str):\n conn = sqlite3.connect(db_path)\n cursor = conn.cursor()\n cursor.execute(f\"CREATE TABLE {table_name} ({column_name} TEXT)\")\n cursor.executemany(f\"INSERT INTO {table_name} ({column_name}) VALUES (?)\", [(\"Hello\\nWorld\",), (\"Good\\nMorning\",), (\"Welcome\\nBack\",)])\n conn.commit()\n conn.close()\nimport unittest\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.db1_path = 'test_db1.db'\n cls.db2_path = 'test_db2.db'\n cls.table_name1 = 'TestData1'\n cls.table_name2 = 'TestData2'\n cls.column_name1 = 'TextColumn1'\n cls.column_name2 = 'TextColumn2'\n create_mock_db(cls.db1_path, cls.table_name1, cls.column_name1)\n create_mock_db(cls.db2_path, cls.table_name2, cls.column_name2)\n @classmethod\n def tearDownClass(cls):\n os.remove(cls.db1_path)\n os.remove(cls.db2_path)\n os.remove('nonexistent.db')\n \n def test_valid_input(self):\n df1 = f_768(self.db1_path, self.table_name1, self.column_name1)\n self.assertIn('
', df1[self.column_name1].iloc[0])\n def test_different_table_and_column(self):\n df2 = f_768(self.db2_path, self.table_name2, self.column_name2)\n self.assertIn('
', df2[self.column_name2].iloc[1])\n def test_invalid_db_path(self):\n # Adjusting for the fact that a non-existent database doesn't cause sqlite3.OperationalError when using pandas\n try:\n f_768('nonexistent.db', self.table_name1, self.column_name1)\n self.fail(\"Expected an exception due to nonexistent database path\")\n except Exception as e:\n self.assertIsInstance(e, (sqlite3.OperationalError, pd.errors.DatabaseError))\n def test_invalid_table_name(self):\n with self.assertRaises(pd.errors.DatabaseError):\n f_768(self.db1_path, 'NonexistentTable', self.column_name1)\n def test_invalid_column_name(self):\n # This checks for a KeyError since pandas will raise this if the column does not exist\n with self.assertRaises(KeyError):\n f_768(self.db1_path, self.table_name1, 'NonexistentColumn')", "apis": ["pandas.DataFrame", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["sqlite3", "pandas"], "doc": {"description": ["Loads data from an SQLite database into a Pandas DataFrame and performs a string replacement operation", "on a specified column. Specifically, replaces all occurrences of the newline character '\\n' with the HTML line", "break tag '
'."], "note": [], "params": ["db_path (str): The path to the SQLite database file.", "table_name (str): The name of the table from which to load data.", "column_name (str): The name of the column in which to perform string replacement."], "returns": ["pd.DataFrame: The modified DataFrame with replaced strings in the specified column."], "reqs": ["pandas", "sqlite3", "os"], "raises": [], "example": ["Examples:", ">>> df = f_768('./data.db', 'messages', 'content')", ">>> df.loc[0, 'content'] # Assuming the first row originally contained \"Hello\\nWorld\"", "'Hello
World'", ">>> df = f_768('./another_data.db', 'comments', 'text')", ">>> df.loc[1, 'text'] # Assuming the second row originally contained \"Good\\nMorning\"", "'Good
Morning'"]}} -{"task_id": "f_358", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.preprocessing import MinMaxScaler\n\n\ndef f_358(data: pd.DataFrame) -> (pd.DataFrame, plt.Axes):\n \"\"\"\n Normalize the data and visualize it using a heatmap.\n\n This function takes a pandas DataFrame, normalizes the data to a range [0, 1], and then visualizes this\n normalized data using a seaborn heatmap. The heatmap uses the \"YlGnBu\" colormap to represent normalized\n values and includes a color bar labeled \"Normalized Value\" to indicate the range of data values.\n It returns both the normalized data and the heatmap plot.\n\n Parameters:\n - data (pd.DataFrame): The input data with multiple features in columns.\n\n Returns:\n - pd.DataFrame: Normalized data.\n - plt.Axes: Heatmap plot of the normalized data.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - seaborn\n \n Example:\n >>> df = pd.DataFrame([[1,1,1], [2,2,2], [3,3,3]], columns=['Feature1', 'Feature2', 'Feature3'])\n >>> normalized_df, _ = f_358(df)\n >>> type(normalized_df)\n \n >>> normalized_df['Feature1'].iloc[0] # Returns a normalized value between 0 and 1\n 0.0\n \"\"\"", "canonical_solution": " # Normalizing the data\n scaler = MinMaxScaler()\n normalized_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)\n\n # Plotting heatmap\n plt.figure(figsize=(10, 8))\n ax = sns.heatmap(\n normalized_data, cmap=\"YlGnBu\", cbar_kws={\"label\": \"Normalized Value\"}\n )\n\n return normalized_data, ax", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n # default columns used for testing, but function is not limited to these options\n self.expected_columns = [\n \"Feature1\",\n \"Feature2\",\n \"Feature3\",\n \"Feature4\",\n \"Feature5\",\n ]\n def _check_data_structure(self, data, expected_columns):\n 
self.assertIsInstance(data, pd.DataFrame)\n for col in data.columns:\n self.assertIn(col, expected_columns)\n def _check_data_value(self, data):\n # Check if values in normalized data are between 0 and 1\n # (allowing a small margin for precision issues)\n self.assertTrue(((data.values >= -1e-10) & (data.values <= 1.00000001)).all())\n def _check_heatmap(self, ax):\n # Test visualization\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.collections), 1) # 1 heatmap\n cbar = ax.collections[0].colorbar\n self.assertTrue(cbar is not None)\n self.assertTrue(cbar.ax.get_ylabel(), \"Normalized Value\")\n self.assertEqual(ax.collections[0].cmap.name, \"YlGnBu\")\n def test_case_1(self):\n # Test with random data\n data = pd.DataFrame(\n np.random.rand(100, 5),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_2(self):\n # Test with data having all zeros\n data = pd.DataFrame(\n np.zeros((100, 5)),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n self._check_heatmap(ax)\n # Check if all values in normalized data are zero\n self.assertTrue((normalized_data.values == 0).all())\n def test_case_3(self):\n # Test with data having incremental values\n data = pd.DataFrame(\n np.arange(500).reshape(100, 5),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_4(self):\n # Test with data having decremental values\n data = pd.DataFrame(\n np.arange(500, 0, -1).reshape(100, 5),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n 
self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_5(self):\n # Test single valid column\n data = pd.DataFrame(np.random.rand(100, 1), columns=[\"Feature1\"])\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, [\"Feature1\"])\n self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_6(self):\n # Test should fail when inputs are invalid - string column\n data = pd.DataFrame(\n {\"Feature1\": np.random.rand(100), \"Feature2\": [\"string\"] * 100}\n )\n with self.assertRaises(ValueError):\n f_358(data)\n def test_case_7(self):\n # Test should fail when inputs are invalid - empty dataframe\n data = pd.DataFrame()\n with self.assertRaises(ValueError):\n f_358(data)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["sklearn.preprocessing.MinMaxScaler", "matplotlib.pyplot.Axes", "seaborn.heatmap", "matplotlib.pyplot.figure", "pandas.DataFrame"], "libs": ["matplotlib", "pandas", "sklearn", "seaborn"], "doc": {"description": ["Normalize the data and visualize it using a heatmap.", "This function takes a pandas DataFrame, normalizes the data to a range [0, 1], and then visualizes this", "normalized data using a seaborn heatmap. 
The heatmap uses the \"YlGnBu\" colormap to represent normalized", "values and includes a color bar labeled \"Normalized Value\" to indicate the range of data values.", "It returns both the normalized data and the heatmap plot."], "note": [], "params": ["data (pd.DataFrame): The input data with multiple features in columns."], "returns": ["pd.DataFrame: Normalized data.", "plt.Axes: Heatmap plot of the normalized data."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "seaborn"], "raises": [], "example": [">>> df = pd.DataFrame([[1,1,1], [2,2,2], [3,3,3]], columns=['Feature1', 'Feature2', 'Feature3'])", ">>> normalized_df, _ = f_358(df)", ">>> type(normalized_df)", "", ">>> normalized_df['Feature1'].iloc[0] # Returns a normalized value between 0 and 1", "0.0"]}} +{"task_id": "f_768", "prompt": "import pandas as pd\nimport sqlite3\nimport os\n\ndef f_768(db_path: str, table_name: str, column_name: str) -> pd.DataFrame:\n \"\"\"\n Loads data from an SQLite database into a Pandas DataFrame and performs a string replacement operation\n on a specified column. Specifically, replaces all occurrences of the newline character '\\n' with the HTML line\n break tag '
'.\n \n Requirements:\n - pandas\n - sqlite3\n - os\n \n Parameters:\n - db_path (str): The path to the SQLite database file.\n - table_name (str): The name of the table from which to load data.\n - column_name (str): The name of the column in which to perform string replacement.\n \n Returns:\n pd.DataFrame: The modified DataFrame with replaced strings in the specified column.\n\n Examples:\n >>> df = f_768('./data.db', 'messages', 'content')\n >>> df.loc[0, 'content'] # Assuming the first row originally contained \"Hello\\nWorld\"\n 'Hello
World'\n >>> df = f_768('./another_data.db', 'comments', 'text')\n >>> df.loc[1, 'text'] # Assuming the second row originally contained \"Good\\nMorning\"\n 'Good
Morning'\n \"\"\"", "canonical_solution": " try:\n conn = sqlite3.connect(db_path)\n df = pd.read_sql_query(f\"SELECT * FROM {table_name}\", conn)\n df[column_name] = df[column_name].replace({'\\n': '
'}, regex=True)\n finally:\n conn.close()\n return df", "test": "def create_mock_db(db_path: str, table_name: str, column_name: str):\n conn = sqlite3.connect(db_path)\n cursor = conn.cursor()\n cursor.execute(f\"CREATE TABLE {table_name} ({column_name} TEXT)\")\n cursor.executemany(f\"INSERT INTO {table_name} ({column_name}) VALUES (?)\", [(\"Hello\\nWorld\",), (\"Good\\nMorning\",), (\"Welcome\\nBack\",)])\n conn.commit()\n conn.close()\nimport unittest\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n cls.db1_path = 'test_db1.db'\n cls.db2_path = 'test_db2.db'\n cls.table_name1 = 'TestData1'\n cls.table_name2 = 'TestData2'\n cls.column_name1 = 'TextColumn1'\n cls.column_name2 = 'TextColumn2'\n create_mock_db(cls.db1_path, cls.table_name1, cls.column_name1)\n create_mock_db(cls.db2_path, cls.table_name2, cls.column_name2)\n @classmethod\n def tearDownClass(cls):\n os.remove(cls.db1_path)\n os.remove(cls.db2_path)\n os.remove('nonexistent.db')\n \n def test_valid_input(self):\n df1 = f_768(self.db1_path, self.table_name1, self.column_name1)\n self.assertIn('
', df1[self.column_name1].iloc[0])\n def test_different_table_and_column(self):\n df2 = f_768(self.db2_path, self.table_name2, self.column_name2)\n self.assertIn('
', df2[self.column_name2].iloc[1])\n def test_invalid_db_path(self):\n # Adjusting for the fact that a non-existent database doesn't cause sqlite3.OperationalError when using pandas\n try:\n f_768('nonexistent.db', self.table_name1, self.column_name1)\n self.fail(\"Expected an exception due to nonexistent database path\")\n except Exception as e:\n self.assertIsInstance(e, (sqlite3.OperationalError, pd.errors.DatabaseError))\n def test_invalid_table_name(self):\n with self.assertRaises(pd.errors.DatabaseError):\n f_768(self.db1_path, 'NonexistentTable', self.column_name1)\n def test_invalid_column_name(self):\n # This checks for a KeyError since pandas will raise this if the column does not exist\n with self.assertRaises(KeyError):\n f_768(self.db1_path, self.table_name1, 'NonexistentColumn')", "apis": ["pandas.DataFrame", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["pandas", "sqlite3"], "doc": {"description": ["Loads data from an SQLite database into a Pandas DataFrame and performs a string replacement operation", "on a specified column. Specifically, replaces all occurrences of the newline character '\\n' with the HTML line", "break tag '
'."], "note": [], "params": ["db_path (str): The path to the SQLite database file.", "table_name (str): The name of the table from which to load data.", "column_name (str): The name of the column in which to perform string replacement."], "returns": ["pd.DataFrame: The modified DataFrame with replaced strings in the specified column."], "reqs": ["pandas", "sqlite3", "os"], "raises": [], "example": ["Examples:", ">>> df = f_768('./data.db', 'messages', 'content')", ">>> df.loc[0, 'content'] # Assuming the first row originally contained \"Hello\\nWorld\"", "'Hello
World'", ">>> df = f_768('./another_data.db', 'comments', 'text')", ">>> df.loc[1, 'text'] # Assuming the second row originally contained \"Good\\nMorning\"", "'Good
Morning'"]}} +{"task_id": "f_358", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.preprocessing import MinMaxScaler\n\n\ndef f_358(data: pd.DataFrame) -> (pd.DataFrame, plt.Axes):\n \"\"\"\n Normalize the data and visualize it using a heatmap.\n\n This function takes a pandas DataFrame, normalizes the data to a range [0, 1], and then visualizes this\n normalized data using a seaborn heatmap. The heatmap uses the \"YlGnBu\" colormap to represent normalized\n values and includes a color bar labeled \"Normalized Value\" to indicate the range of data values.\n It returns both the normalized data and the heatmap plot.\n\n Parameters:\n - data (pd.DataFrame): The input data with multiple features in columns.\n\n Returns:\n - pd.DataFrame: Normalized data.\n - plt.Axes: Heatmap plot of the normalized data.\n\n Requirements:\n - pandas\n - numpy\n - matplotlib.pyplot\n - seaborn\n \n Example:\n >>> df = pd.DataFrame([[1,1,1], [2,2,2], [3,3,3]], columns=['Feature1', 'Feature2', 'Feature3'])\n >>> normalized_df, _ = f_358(df)\n >>> type(normalized_df)\n \n >>> normalized_df['Feature1'].iloc[0] # Returns a normalized value between 0 and 1\n 0.0\n \"\"\"", "canonical_solution": " # Normalizing the data\n scaler = MinMaxScaler()\n normalized_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)\n\n # Plotting heatmap\n plt.figure(figsize=(10, 8))\n ax = sns.heatmap(\n normalized_data, cmap=\"YlGnBu\", cbar_kws={\"label\": \"Normalized Value\"}\n )\n\n return normalized_data, ax", "test": "import unittest\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n # default columns used for testing, but function is not limited to these options\n self.expected_columns = [\n \"Feature1\",\n \"Feature2\",\n \"Feature3\",\n \"Feature4\",\n \"Feature5\",\n ]\n def _check_data_structure(self, data, expected_columns):\n 
self.assertIsInstance(data, pd.DataFrame)\n for col in data.columns:\n self.assertIn(col, expected_columns)\n def _check_data_value(self, data):\n # Check if values in normalized data are between 0 and 1\n # (allowing a small margin for precision issues)\n self.assertTrue(((data.values >= -1e-10) & (data.values <= 1.00000001)).all())\n def _check_heatmap(self, ax):\n # Test visualization\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.collections), 1) # 1 heatmap\n cbar = ax.collections[0].colorbar\n self.assertTrue(cbar is not None)\n self.assertTrue(cbar.ax.get_ylabel(), \"Normalized Value\")\n self.assertEqual(ax.collections[0].cmap.name, \"YlGnBu\")\n def test_case_1(self):\n # Test with random data\n data = pd.DataFrame(\n np.random.rand(100, 5),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_2(self):\n # Test with data having all zeros\n data = pd.DataFrame(\n np.zeros((100, 5)),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n self._check_heatmap(ax)\n # Check if all values in normalized data are zero\n self.assertTrue((normalized_data.values == 0).all())\n def test_case_3(self):\n # Test with data having incremental values\n data = pd.DataFrame(\n np.arange(500).reshape(100, 5),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_4(self):\n # Test with data having decremental values\n data = pd.DataFrame(\n np.arange(500, 0, -1).reshape(100, 5),\n columns=self.expected_columns,\n )\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, self.expected_columns)\n 
self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_5(self):\n # Test single valid column\n data = pd.DataFrame(np.random.rand(100, 1), columns=[\"Feature1\"])\n normalized_data, ax = f_358(data)\n self._check_data_structure(normalized_data, [\"Feature1\"])\n self._check_data_value(normalized_data)\n self._check_heatmap(ax)\n def test_case_6(self):\n # Test should fail when inputs are invalid - string column\n data = pd.DataFrame(\n {\"Feature1\": np.random.rand(100), \"Feature2\": [\"string\"] * 100}\n )\n with self.assertRaises(ValueError):\n f_358(data)\n def test_case_7(self):\n # Test should fail when inputs are invalid - empty dataframe\n data = pd.DataFrame()\n with self.assertRaises(ValueError):\n f_358(data)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.figure", "seaborn.heatmap", "pandas.DataFrame", "sklearn.preprocessing.MinMaxScaler", "matplotlib.pyplot.Axes"], "libs": ["sklearn", "seaborn", "pandas", "matplotlib"], "doc": {"description": ["Normalize the data and visualize it using a heatmap.", "This function takes a pandas DataFrame, normalizes the data to a range [0, 1], and then visualizes this", "normalized data using a seaborn heatmap. 
The heatmap uses the \"YlGnBu\" colormap to represent normalized", "values and includes a color bar labeled \"Normalized Value\" to indicate the range of data values.", "It returns both the normalized data and the heatmap plot."], "note": [], "params": ["data (pd.DataFrame): The input data with multiple features in columns."], "returns": ["pd.DataFrame: Normalized data.", "plt.Axes: Heatmap plot of the normalized data."], "reqs": ["pandas", "numpy", "matplotlib.pyplot", "seaborn"], "raises": [], "example": [">>> df = pd.DataFrame([[1,1,1], [2,2,2], [3,3,3]], columns=['Feature1', 'Feature2', 'Feature3'])", ">>> normalized_df, _ = f_358(df)", ">>> type(normalized_df)", "", ">>> normalized_df['Feature1'].iloc[0] # Returns a normalized value between 0 and 1", "0.0"]}} {"task_id": "f_527", "prompt": "from itertools import combinations\nimport math\n\ndef f_527(seq, letter_weight_dict):\n \"\"\"\n Find the subsequence in a string that has the maximum total weight based on the weights given for each character. 
\n The weights are assigned randomly and a subsequence is a sequence that can be derived from another sequence by deleting some elements without changing the order of the remaining elements.\n\n Parameters:\n - seq (str): The input string.\n - letter_weight_dict (dict): A dictionary with the weights for each character.\n\n Returns:\n - str: The subsequence with the highest weight.\n\n Requirements:\n - itertools\n - math\n\n Example:\n >>> f_527('abc', {'a': 1, 'b': 2, 'c': 3})\n 'abc'\n >>> f_527('aabc', {'a': 10, 'b': -5, 'c': 3})\n 'aac'\n \"\"\"", "canonical_solution": " max_weight = -math.inf\n max_subseq = ''\n\n for r in range(1, len(seq) + 1):\n for subseq in combinations(seq, r):\n weight = sum(letter_weight_dict[c] for c in subseq)\n if weight > max_weight:\n max_weight = weight\n max_subseq = ''.join(subseq)\n\n return max_subseq", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def base(self, seq, letter_weight_dict, correct_seq):\n # Run function\n result = f_527(seq, letter_weight_dict)\n # Check result\n self.assertTrue(isinstance(result, str))\n self.assertEqual(result, correct_seq)\n def test_case_1(self):\n self.base('abc', {'a': 1, 'b': 2, 'c': 3}, 'abc')\n \n def test_case_2(self):\n self.base('aabc', {'a': 10, 'b': -5, 'c': 3}, 'aac')\n def test_case_3(self):\n self.base('zx', {'x': 1, 'z': 2}, 'zx')\n \n def test_case_4(self):\n self.base('lfhah', {'a': 1, 'f': 2, 'h': -1, 'l': 4}, 'lfa')\n \n def test_case_5(self):\n self.base('a', {'a': 1}, 'a')", "apis": ["itertools.combinations", "math.inf"], "libs": ["itertools", "math"], "doc": {"description": ["Find the subsequence in a string that has the maximum total weight based on the weights given for each character.", "The weights are assigned randomly and a subsequence is a sequence that can be derived from another sequence by deleting some elements without changing the order of the remaining elements."], "note": [], "params": ["seq (str): The input string.", "letter_weight_dict 
(dict): A dictionary with the weights for each character."], "returns": ["str: The subsequence with the highest weight."], "reqs": ["itertools", "math"], "raises": [], "example": [">>> f_527('abc', {'a': 1, 'b': 2, 'c': 3})", "'abc'", ">>> f_527('aabc', {'a': 10, 'b': -5, 'c': 3})", "'aac'"]}} {"task_id": "f_563", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_563(tuples_list, columns):\n \"\"\"\n Convert a list of tuples into a Pandas DataFrame, perform a default scaling in each column, and return the transformed DataFrame.\n \n Parameters:\n - tuples_list (list): The list of tuples.\n - columns (list): The list of column names.\n \n Returns:\n - df_scaled (DataFrame): A pandas DataFrame containing the scaled versions of the original data.\n\n Requirements:\n - pandas\n - sklearn\n \n Example:\n >>> df = f_563([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], ['A', 'B', 'C', 'D'])\n >>> print(df)\n A B C D\n 0 -1.224745 -1.224745 -1.224745 -1.224745\n 1 0.000000 0.000000 0.000000 0.000000\n 2 1.224745 1.224745 1.224745 1.224745\n \"\"\"", "canonical_solution": " df = pd.DataFrame(tuples_list, columns=columns)\n scaler = StandardScaler()\n df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)\n\n return df_scaled", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = f_563([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], ['A', 'B', 'C', 'D'])\n self.assertEqual(df.shape, (3, 4))\n self.assertEqual(df.columns.tolist(), ['A', 'B', 'C', 'D'])\n self.assertEqual(df['A'].tolist(), [-1.224744871391589, 0.0, 1.224744871391589])\n def test_case_2(self):\n df = f_563([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], ['A', 'B', 'C', 'D'])\n self.assertEqual(df.shape, (3, 4))\n self.assertEqual(df.columns.tolist(), ['A', 'B', 'C', 'D'])\n self.assertEqual(df['B'].tolist(), [-1.224744871391589, 0.0, 1.224744871391589])\n def test_case_3(self):\n df = f_563([(1, 2, 3, 4), (5, 6, 7, 
8), (9, 10, 11, 12)], ['A', 'B', 'C', 'D'])\n self.assertEqual(df.shape, (3, 4))\n self.assertEqual(df.columns.tolist(), ['A', 'B', 'C', 'D'])\n self.assertEqual(df['C'].tolist(), [-1.224744871391589, 0.0, 1.224744871391589])\n def test_case_4(self):\n df = f_563([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], ['A', 'B', 'C', 'D'])\n self.assertEqual(df.shape, (3, 4))\n self.assertEqual(df.columns.tolist(), ['A', 'B', 'C', 'D'])\n self.assertEqual(df['D'].tolist(), [-1.224744871391589, 0.0, 1.224744871391589])\n def test_case_5(self):\n df = f_563([(0, 0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0)], ['A', 'B', 'C', 'D'])\n self.assertEqual(df.shape, (3, 4))\n self.assertEqual(df.columns.tolist(), ['A', 'B', 'C', 'D'])\n self.assertEqual(df['A'].tolist(), [0.0, 0.0, 0.0])", "apis": ["pandas.DataFrame", "sklearn.preprocessing.StandardScaler"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Convert a list of tuples into a Pandas DataFrame, perform a default scaling in each column, and return the transformed DataFrame."], "note": [], "params": ["tuples_list (list): The list of tuples.", "columns (list): The list of column names."], "returns": ["df_scaled (DataFrame): A pandas DataFrame containing the scaled versions of the original data."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> df = f_563([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], ['A', 'B', 'C', 'D'])", ">>> print(df)", "A B C D", "0 -1.224745 -1.224745 -1.224745 -1.224745", "1 0.000000 0.000000 0.000000 0.000000", "2 1.224745 1.224745 1.224745 1.224745"]}} -{"task_id": "f_346", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_346(P, T):\n \"\"\"\n Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then visualize the\n result in 3D with matplotlib. 
Note: This function only accepts numpy matrices/arrays.\n\n Parameters:\n P (numpy.ndarray): The input matrix with shape (N, 3), where N is the number of rows.\n T (numpy.ndarray): The input tensor with shape (3, 3, 3).\n\n Returns:\n tuple:\n - result (numpy.ndarray): The product of matrix P and tensor T with shape (N, 3).\n - ax (mpl_toolkits.mplot3d.axes3d.Axes3D): The 3D visualization of the result.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1]])\n >>> T = np.random.rand(3, 3, 3)\n >>> result, ax = f_346(P, T)\n >>> type(result)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if not (isinstance(P, np.ndarray) and isinstance(T, np.ndarray)):\n raise TypeError(\"Expected inputs to be numpy arrays\")\n\n # Compute the matrix-tensor product to ensure the result has the desired shape\n result = np.einsum(\"ij,jkl->ik\", P, T)\n\n # Visualize the result in 3D\n fig = plt.figure()\n ax = fig.add_subplot(111, projection=\"3d\")\n ax.scatter(result[:, 0], result[:, 1], result[:, 2])\n\n # Return the result and the 3D visualization\n return result, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n self.test_P = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.test_T = np.random.rand(3, 3, 3)\n def check_result_correctness(self, P, T, result):\n # Manually compute the expected result for the matrix-tensor product\n expected_result = np.einsum(\"ij,jkl->ik\", P, T)\n return np.allclose(result, expected_result)\n def test_case_1(self):\n # Test output visualization\n _, ax = f_346(self.test_P, self.test_T)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"\")\n self.assertEqual(ax.get_xlabel(), \"\")\n self.assertEqual(ax.get_ylabel(), \"\")\n ax.set_title(\"Test Title\")\n ax.set_xlabel(\"X Label\")\n ax.set_ylabel(\"Y Label\")\n self.assertEqual(ax.get_title(), 
\"Test Title\")\n self.assertEqual(ax.get_xlabel(), \"X Label\")\n self.assertEqual(ax.get_ylabel(), \"Y Label\")\n def test_case_2(self):\n # Test result correctness\n result, _ = f_346(self.test_P, self.test_T)\n self.assertTrue(self.check_result_correctness(self.test_P, self.test_T, result))\n self.assertEqual(result.shape, (self.test_P.shape[0], 3))\n def test_case_3(self):\n # Test with zeros and negative values\n P = np.array([[0, 0, 0]])\n T = np.random.rand(3, 3, 3) - 0.5\n result, _ = f_346(P, T)\n self.assertTrue(np.all(result == 0))\n def test_case_4(self):\n # Test with non-numeric data\n P = np.array([[\"a\", \"b\", \"c\"], [1, 2, 3]])\n with self.assertRaises(Exception):\n f_346(P, self.test_T)\n def test_case_5(self):\n # Test incompatible shapes\n P = np.array([[1, 2], [3, 4]])\n with self.assertRaises(Exception):\n f_346(P, self.test_T)\n def test_case_6(self):\n # Test incompatible input types\n with self.assertRaises(Exception):\n f_346([1, 2], [2, 1])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.einsum", "numpy.ndarray", "matplotlib.pyplot.figure"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then visualize the", "result in 3D with matplotlib. 
Note: This function only accepts numpy matrices/arrays."], "note": [], "params": ["P (numpy.ndarray): The input matrix with shape (N, 3), where N is the number of rows.", "T (numpy.ndarray): The input tensor with shape (3, 3, 3)."], "returns": ["tuple:", "result (numpy.ndarray): The product of matrix P and tensor T with shape (N, 3).", "ax (mpl_toolkits.mplot3d.axes3d.Axes3D): The 3D visualization of the result."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1]])", ">>> T = np.random.rand(3, 3, 3)", ">>> result, ax = f_346(P, T)", ">>> type(result)", "", ">>> type(ax)", ""]}} -{"task_id": "f_359", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_359(L):\n \"\"\"\n Draw a histogram of all elements in a nested list 'L' and return the Axes object of the plot.\n\n The function first uses Numpy to handle array operations, checking for correct input type\n while ignoring empty sublists. It then plots the histogram using pandas, assigning\n each unique value its own bin and plotting the histogram with rwidth 0.8.\n\n Parameters:\n L (list of list of int): Nested list of integers.\n\n Returns:\n ax (matplotlib.axes._axes.Axes): The Axes object of the histogram plot.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> ax = f_359([[1,2,3],[4,5,6]])\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7')]\n \"\"\"", "canonical_solution": "\n flattened = np.concatenate([l for l in L if l])\n if not np.issubdtype(flattened.dtype, np.integer):\n raise TypeError(\"Expected list of list of int\")\n bins = len(np.unique(flattened))\n ax = pd.Series(flattened).plot(kind=\"hist\", rwidth=0.8, bins=bins)\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test 
non-overlapping numbers split into multi-item lists\n ax = f_359([[1, 2, 3], [4, 5, 6]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 12)\n def test_case_2(self):\n # Test non-overlapping numbers in individual lists\n ax = f_359([[1], [2], [3], [4], [5], [6]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 6)\n def test_case_3(self):\n # Test overlapping numbers split into multi-item lists\n ax = f_359([[1, 1], [2, 2], [3, 3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_4(self):\n # Test overlapping numbers that repeat across items\n ax = f_359([[1, 2], [1, 3], [2, 3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_5(self):\n # Test overlapping numbers in individual lists\n ax = f_359([[1], [1], [2], [2], [3], [3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_6(self):\n # Test case with uneven segment sizes\n ax = f_359([[10, 20, 30], [40]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 4)\n def test_case_7(self):\n # Test negative integers\n ax = f_359([[-1, -2], [-2, -3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_8(self):\n # Test larger integers\n ax = f_359([[10000, 20000], [30000]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_9(self):\n # Test single element\n ax = f_359([[1]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 1)\n def test_case_10(self):\n # Test handling mix of valid sublists and empty ones\n ax = f_359([[], [1, 2], [], [3, 4], []])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 4)\n def test_case_11(self):\n # Test handling NumPy array conversion\n ax = f_359([[np.int64(1)], [np.int32(2)], [3]])\n self.assertIsInstance(ax, plt.Axes)\n 
self.assertEqual(len(ax.patches), 3)\n def test_case_12(self):\n # Test handling invalid input - fully empty lists, excessive nesting\n with self.assertRaises(ValueError):\n f_359([[], [], []])\n with self.assertRaises(ValueError):\n f_359([[[1]], [2], [3]])\n def test_case_13(self):\n # Test handling invalid input - non-int types\n with self.assertRaises(TypeError):\n f_359([1.1, 2.2], [3.3])\n with self.assertRaises(TypeError):\n f_359([\"1\", \"2\"], [\"3\", \"4\"])\n with self.assertRaises(TypeError):\n f_359([[1, 2], [\"a\", \"b\"]])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.issubdtype", "pandas.Series", "numpy.unique", "numpy.integer", "numpy.concatenate"], "libs": ["numpy", "pandas"], "doc": {"description": ["Draw a histogram of all elements in a nested list 'L' and return the Axes object of the plot.", "The function first uses Numpy to handle array operations, checking for correct input type", "while ignoring empty sublists. It then plots the histogram using pandas, assigning", "each unique value its own bin and plotting the histogram with rwidth 0.8."], "note": [], "params": ["L (list of list of int): Nested list of integers."], "returns": ["ax (matplotlib.axes._axes.Axes): The Axes object of the histogram plot."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> ax = f_359([[1,2,3],[4,5,6]])", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7')]"]}} -{"task_id": "f_925", "prompt": "import pandas as pd\nimport seaborn as sns\n\n\ndef f_925(data=None):\n \"\"\"\n Converts string-formatted weights to floats and plots a scatter plot of weight against height.\n\n This function takes a dictionary with two keys: 'Weight_String' and 'Height'. 
The 'Weight_String' key should \n contain a list of weight values in string format, while the 'Height' key should have a list of corresponding \n height values in numerical format. If the input dictionary is not provided, the function uses a default dataset.\n The function then converts the string-formatted weights into float, and plots a scatter plot to visualize \n the relationship between weight and height.\n \n Parameters:\n - data (dict, optional): A dictionary with keys 'Weight_String' and 'Height'. 'Weight_String' is expected to be \n a list of weight values in string format (e.g., ['60.5', '65.7']), and 'Height' is expected \n to be a list of corresponding numerical height values (e.g., [160, 165]). If no dictionary \n is provided, a default dataset with predetermined values is used.\n Default dictionary:\n {\n 'Weight_String': ['60.5', '65.7', '70.2', '75.9', '80.1'],\n 'Height': [160, 165, 170, 175, 180]\n }\n\n Returns:\n - ax (matplotlib.axes._subplots.AxesSubplot): A scatter plot with weight on the x-axis and height on the y-axis, titled \"Weight vs Height\".\n\n Raises:\n - ValueError: If any of the values in the 'Weight_String' key are not formatted as strings. 
This validation ensures \n that the weight data is in the expected format for conversion to float.\n\n Requirements:\n - pandas\n - seaborn\n\n Example:\n >>> ax = f_925()\n >>> print(ax.get_title())\n Weight vs Height\n \"\"\"", "canonical_solution": " if data is None:\n data = {\n \"Weight_String\": [\"60.5\", \"65.7\", \"70.2\", \"75.9\", \"80.1\"],\n \"Height\": [160, 165, 170, 175, 180],\n }\n\n df = pd.DataFrame(data)\n\n # Validate weight values are strings\n if not all(isinstance(weight, str) for weight in df[\"Weight_String\"]):\n raise ValueError(\"Weights must be provided as strings.\")\n\n # Convert string weights to floats\n df[\"Weight_Float\"] = df[\"Weight_String\"].astype(float)\n\n # Plotting the scatter plot\n ax = sns.scatterplot(data=df, x=\"Weight_Float\", y=\"Height\")\n ax.set_title(\"Weight vs Height\")\n return ax", "test": "import unittest\nimport pandas as pd\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_925\"\"\"\n def test_default_data(self):\n \"\"\"Test f_925 with its default data.\"\"\"\n result = f_925()\n self.assertIsInstance(result, Axes)\n def test_custom_data(self):\n \"\"\"Test f_925 with custom data.\"\"\"\n custom_data = {\n \"Weight_String\": [\"50.5\", \"55.7\", \"60.2\"],\n \"Height\": [150, 155, 160],\n }\n result = f_925(custom_data)\n self.assertIsInstance(result, Axes)\n def test_incorrect_data_type(self):\n \"\"\"Test f_925 with incorrect data types in Weight_String.\"\"\"\n incorrect_data = {\n \"Weight_String\": [\n 60.5,\n 65.7,\n 70.2,\n ], # Intentionally using floats instead of strings\n \"Height\": [160, 165, 170],\n }\n with self.assertRaises(ValueError):\n f_925(incorrect_data)\n def test_empty_data(self):\n \"\"\"Test f_925 with empty data.\"\"\"\n empty_data = {\"Weight_String\": [], \"Height\": []}\n result = f_925(empty_data)\n self.assertIsInstance(result, Axes)\n def test_mismatched_data_length(self):\n \"\"\"Test f_925 with mismatched lengths of 
Weight_String and Height.\"\"\"\n mismatched_data = {\n \"Weight_String\": [\"60.5\", \"65.7\"], # Less weights than heights\n \"Height\": [160, 165, 170],\n }\n with self.assertRaises(ValueError):\n f_925(mismatched_data)", "apis": ["pandas.DataFrame", "seaborn.scatterplot"], "libs": ["pandas", "seaborn"], "doc": {"description": ["Converts string-formatted weights to floats and plots a scatter plot of weight against height.", "This function takes a dictionary with two keys: 'Weight_String' and 'Height'. The 'Weight_String' key should", "contain a list of weight values in string format, while the 'Height' key should have a list of corresponding", "height values in numerical format. If the input dictionary is not provided, the function uses a default dataset.", "The function then converts the string-formatted weights into float, and plots a scatter plot to visualize", "the relationship between weight and height."], "note": [], "params": ["data (dict, optional): A dictionary with keys 'Weight_String' and 'Height'. 'Weight_String' is expected to be", "a list of weight values in string format (e.g., ['60.5', '65.7']), and 'Height' is expected", "to be a list of corresponding numerical height values (e.g., [160, 165]). If no dictionary", "is provided, a default dataset with predetermined values is used.", "Default dictionary:", "{", "'Weight_String': ['60.5', '65.7', '70.2', '75.9', '80.1'],", "'Height': [160, 165, 170, 175, 180]", "}"], "returns": ["ax (matplotlib.axes._subplots.AxesSubplot): A scatter plot with weight on the x-axis and height on the y-axis, titled \"Weight vs Height\"."], "reqs": ["pandas", "seaborn"], "raises": ["ValueError: If any of the values in the 'Weight_String' key are not formatted as strings. 
This validation ensures", "that the weight data is in the expected format for conversion to float."], "example": [">>> ax = f_925()", ">>> print(ax.get_title())", "Weight vs Height"]}} -{"task_id": "f_774", "prompt": "from collections import Counter\nimport re\n\ndef f_774(word: str) -> list:\n \"\"\"\n Finds the most common two-letter combination in a given, cleaned word (lowercased and alphabetic characters only) \n and returns its frequency. The search is case-insensitive and ignores non-alphabetic characters.\n \n Requirements:\n - collections.Counter\n - re\n \n Parameters:\n - word (str): The input string containing the word to analyze. The word should have a length of at least 2 to form pairs.\n \n Returns:\n - list: A list containing a single tuple. The tuple consists of the most frequent two-letter combination (str) \n and its frequency (int). Returns an empty list if the word has fewer than 2 letters, or after cleaning, \n the word has fewer than 2 alphabetic characters.\n \n Examples:\n >>> f_774(\"aaBBcc\")\n [('aa', 1)]\n >>> f_774(\"abc!abc\")\n [('ab', 2)]\n >>> f_774(\"a\")\n []\n >>> f_774(\"abcd\")\n [('ab', 1)]\n >>> f_774(\"a1b2c3\")\n [('ab', 1)]\n \"\"\"", "canonical_solution": " # Clean the word: lowercase and keep alphabetic characters only\n clean_word = re.sub('[^a-z]', '', word.lower())\n \n if len(clean_word) < 2:\n return []\n \n pairs = [clean_word[i:i+2] for i in range(len(clean_word) - 1)]\n pair_counter = Counter(pairs)\n most_common = pair_counter.most_common(1)\n \n # This check ensures we return the result directly from most_common without additional filtering\n return most_common", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_repeating_pairs(self):\n self.assertEqual(f_774(\"aabbcc\"), [('aa', 1)], \"Should identify single repeating pair\")\n \n def test_mixed_repeating_pairs(self):\n self.assertEqual(f_774(\"abcabc\"), [('ab', 2)], \"Should identify most frequent pair in mixed sequence\")\n \n def 
test_single_character(self):\n self.assertEqual(f_774(\"a\"), [], \"Should return empty list for single character\")\n \n def test_unique_pairs(self):\n self.assertEqual(f_774(\"abcdef\"), [('ab', 1)], \"Should handle all unique pairs\")\n \n def test_empty_string(self):\n self.assertEqual(f_774(\"\"), [], \"Should return empty list for empty string\")\n def test_case_insensitive(self):\n # Corrected the expected count to match the correct behavior of the function\n self.assertEqual(f_774(\"aAaAbbBB\"), [('aa', 3)], \"Should be case-insensitive\")\n def test_ignore_non_alphabetic(self):\n self.assertEqual(f_774(\"abc123abc!\"), [('ab', 2)], \"Should ignore non-alphabetic characters\")", "apis": ["re.sub", "collections.Counter"], "libs": ["collections", "re"], "doc": {"description": ["Finds the most common two-letter combination in a given, cleaned word (lowercased and alphabetic characters only)", "and returns its frequency. The search is case-insensitive and ignores non-alphabetic characters."], "note": [], "params": ["word (str): The input string containing the word to analyze. The word should have a length of at least 2 to form pairs."], "returns": ["list: A list containing a single tuple. The tuple consists of the most frequent two-letter combination (str)", "and its frequency (int). 
Returns an empty list if the word has fewer than 2 letters, or after cleaning,", "the word has fewer than 2 alphabetic characters."], "reqs": ["collections.Counter", "re"], "raises": [], "example": ["Examples:", ">>> f_774(\"aaBBcc\")", "[('aa', 1)]", ">>> f_774(\"abc!abc\")", "[('ab', 2)]", ">>> f_774(\"a\")", "[]", ">>> f_774(\"abcd\")", "[('ab', 1)]", ">>> f_774(\"a1b2c3\")", "[('ab', 1)]"]}} -{"task_id": "f_859", "prompt": "import requests\nfrom PIL import Image\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_859(url: str) -> \"matplotlib.axes._axes.Axes\":\n \"\"\"\n Downloads an image from the specified URL, converts it to grayscale, and generates a histogram of its grayscale values.\n\n Parameters:\n - url (str): The URL of the image to be downloaded. Must be a valid URL pointing to an image.\n\n Returns:\n - matplotlib.axes._axes.Axes: The Axes object of the generated histogram.\n\n Raises:\n - ValueError: If the URL is invalid or if there's an error downloading the image. Error message will specify the download issue.\n - IOError: If there's an error in opening or processing the downloaded image. 
Error message will specify the processing issue.\n\n Requirements:\n - requests\n - PIL\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_859(\"https://www.example.com/myimage.jpg\")\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " response = None # Initialize response to None\n # Validate the URL\n if not isinstance(url, str) or not url:\n raise ValueError(\"Invalid URL provided.\")\n\n # Download the image with error handling\n try:\n response = requests.get(url, stream=True, timeout=10)\n response.raise_for_status()\n img = Image.open(response.raw).convert(\"L\")\n except requests.RequestException as e:\n raise ValueError(f\"Error downloading the image: {e}\") from e\n except IOError as e:\n raise IOError(f\"Error processing the image: {e}\") from e\n finally:\n if response: # Check if response is not None before closing\n response.close()\n\n # Convert the image to a numpy array\n img_array = np.array(img)\n\n # Create the histogram and return the Axes object\n _, ax = plt.subplots()\n ax.hist(img_array.ravel(), bins=256, color=\"gray\", alpha=0.7)\n ax.set_title(\"Grayscale Histogram\")\n return ax", "test": "import unittest\nfrom unittest.mock import patch, MagicMock, Mock\nimport requests\nimport matplotlib\nfrom PIL import Image\nimport io\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_859.\"\"\"\n def create_mock_image(self):\n \"\"\"\n Creates a mock grayscale image in memory.\n \"\"\"\n img = Image.new(\"L\", (100, 100), color=\"gray\")\n img_byte_arr = io.BytesIO()\n img.save(img_byte_arr, format=\"JPEG\")\n img_byte_arr.seek(0) # Important: move to the start of the BytesIO object\n return img_byte_arr\n @patch(\"requests.get\")\n def test_valid_image_url(self, mock_get):\n \"\"\"\n Test if the function correctly processes a valid image URL and returns a matplotlib Axes object with the correct title.\n \"\"\"\n mock_img = self.create_mock_image()\n mock_get.return_value = Mock(ok=True)\n mock_get.return_value.raw = mock_img\n 
ax = f_859(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertIsInstance(\n ax,\n matplotlib.axes._axes.Axes,\n \"Return type should be matplotlib.axes._axes.Axes\",\n )\n self.assertEqual(\n ax.get_title(),\n \"Grayscale Histogram\",\n \"Histogram should have the title 'Grayscale Histogram'\",\n )\n @patch(\"requests.get\")\n def test_invalid_image_url(self, mock_get):\n \"\"\"\n Test if the function raises a ValueError when provided with an invalid URL.\n \"\"\"\n mock_get.side_effect = requests.exceptions.RequestException\n with self.assertRaises(ValueError):\n f_859(\"invalid_url\")\n @patch(\"requests.get\")\n def test_histogram_bins(self, mock_get):\n \"\"\"\n Test if the histogram generated by the function contains the correct number of bins.\n \"\"\"\n mock_img = self.create_mock_image()\n mock_get.return_value = Mock(ok=True)\n mock_get.return_value.raw = mock_img\n ax = f_859(\"https://www.google.com/images/srpr/logo11w.png\")\n n, bins, _ = ax.hist([], bins=256)\n self.assertEqual(len(bins), 257, \"There should be 257 bin edges for 256 bins\")\n @patch(\"requests.get\")\n def test_histogram_data_range(self, mock_get):\n \"\"\"\n Test if the data range of the histogram is appropriate for a grayscale image (0 to 255).\n \"\"\"\n mock_img = self.create_mock_image()\n mock_get.return_value = Mock(ok=True)\n mock_get.return_value.raw = mock_img\n ax = f_859(\"https://www.google.com/images/srpr/logo11w.png\")\n n, bins, _ = ax.hist([], bins=256)\n self.assertTrue(\n bins[0] >= 0 and bins[-1] <= 255, \"Data range should be between 0 and 255\"\n )\n @patch(\"requests.get\")\n def test_empty_url(self, mock_get):\n \"\"\"\n Test if the function raises a ValueError when provided with an empty URL string.\n \"\"\"\n mock_get.side_effect = requests.exceptions.RequestException\n with self.assertRaises(ValueError):\n f_859(\"\")\n @patch(\"requests.get\")\n @patch(\"PIL.Image.open\")\n def test_ioerror_image_processing(self, mock_image_open, mock_get):\n 
\"\"\"\n Test if the function raises an IOError when there is an error in processing the image.\n \"\"\"\n # Mock requests.get to return a valid response\n mock_get.return_value = MagicMock(ok=True)\n mock_get.return_value.raw = MagicMock()\n # Mock PIL.Image.open to raise IOError\n mock_image_open.side_effect = IOError(\"Mocked IOError\")\n with self.assertRaises(IOError) as context:\n f_859(\"https://www.example.com/image.jpg\")\n self.assertEqual(\n str(context.exception), \"Error processing the image: Mocked IOError\"\n )\n def tearDown(self):\n plt.close()", "apis": ["PIL.Image.open", "requests.get", "requests.RequestException", "numpy.array", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "requests", "PIL"], "doc": {"description": ["Downloads an image from the specified URL, converts it to grayscale, and generates a histogram of its grayscale values."], "note": [], "params": ["url (str): The URL of the image to be downloaded. Must be a valid URL pointing to an image."], "returns": ["matplotlib.axes._axes.Axes: The Axes object of the generated histogram."], "reqs": ["requests", "PIL", "numpy", "matplotlib.pyplot"], "raises": ["ValueError: If the URL is invalid or if there's an error downloading the image. Error message will specify the download issue.", "IOError: If there's an error in opening or processing the downloaded image. Error message will specify the processing issue."], "example": [">>> ax = f_859(\"https://www.example.com/myimage.jpg\")", ">>> type(ax)", ""]}} -{"task_id": "f_339", "prompt": "import pandas as pd\nimport re\nimport random\n\n\ndef f_339(s: str, seed: int = 0) -> pd.DataFrame:\n \"\"\"\n Generate a Pandas DataFrame of products with their ID, quantity, code, price, product, and description\n based on a specified string of product data.\n\n The input string is expected to be divided into segments by newlines. 
Each segment is expected to\n be further split into parts by whitespace: ID, quantity, code, price, and a product description.\n The function will remove trailing whitespaces in each field and assign a product name per unique code.\n Product name is randomly sampled from: ['Apple', 'Banana', 'Orange', 'Pear', 'Grape'].\n The same product name will be assigned to each code for each input s, however different codes can be\n mapped to the same name.\n\n Parameters:\n - s (str): Product data string split by newline, then whitespace.\n Expected format per segment: ' '\n If incomplete, this function raises ValueError.\n - seed (int): Random seed for reproducibility. Defaults to 0.\n\n Returns:\n - data_df (pd.DataFrame): DataFrame with columns: ['ID', 'Quantity', 'Code', 'Price', 'Product', 'Description'].\n Quantity and Price are expected to be integers.\n\n Requirements:\n - pandas\n - re\n - random\n\n Examples:\n >>> s = '1 10 A10B 100 This is a description with spaces'\n >>> df = f_339(s)\n >>> df\n ID Quantity Code Price Product Description\n 0 1 10 A10B 100 Pear This is a description with spaces\n\n >>> s = '1 10 A10B 100 This is a description with spaces\\\\n2 20 B20C 200 Another description example'\n >>> df = f_339(s)\n >>> df\n ID Quantity Code Price Product Description\n 0 1 10 A10B 100 Pear This is a description with spaces\n 1 2 20 B20C 200 Pear Another description example\n \"\"\"", "canonical_solution": "\n if not s:\n raise ValueError(\"Incomplete data provided.\")\n\n random.seed(seed)\n\n products = [\"Apple\", \"Banana\", \"Orange\", \"Pear\", \"Grape\"]\n code_to_product = dict()\n\n data_list = []\n segments = [segment.strip() for segment in s.split(\"\\n\")]\n for segment in segments:\n if segment:\n elements = re.split(r\"\\s+\", segment.strip(), 4)\n if len(elements) < 5:\n raise ValueError(\"Incomplete data provided.\")\n id, quantity, code, price, description = elements\n product = code_to_product.get(code, random.choice(products))\n 
data_list.append([id, quantity, code, price, product, description])\n df = pd.DataFrame(\n data_list, columns=[\"ID\", \"Quantity\", \"Code\", \"Price\", \"Product\", \"Description\"]\n )\n df[\"Quantity\"] = df[\"Quantity\"].astype(int)\n df[\"Price\"] = df[\"Price\"].astype(int)\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.df1 = pd.DataFrame(\n {\n \"ID\": [\"1\"],\n \"Quantity\": [\"10\"],\n \"Code\": [\"A10B\"],\n \"Price\": [\"100\"],\n \"Description\": [\"This is a description with spaces\"],\n }\n )\n self.df2 = pd.DataFrame(\n {\n \"ID\": [\"2\"],\n \"Quantity\": [\"15\"],\n \"Code\": [\"B20C\"],\n \"Price\": [\"200\"],\n \"Description\": [\"Another description with spaces\"],\n }\n )\n self.df_multiple = pd.concat([self.df1, self.df2]).reset_index(drop=True)\n for col in [\"Quantity\", \"Price\"]:\n self.df1[col] = self.df1[col].astype(int)\n self.df2[col] = self.df2[col].astype(int)\n self.df_multiple[col] = self.df_multiple[col].astype(int)\n def _test_most_columns(self, df1, df2):\n columns_to_test = [\"ID\", \"Quantity\", \"Code\", \"Price\", \"Description\"]\n for col in columns_to_test:\n pd.testing.assert_series_equal(df1[col], df2[col])\n def test_case_1(self):\n # Test basic structure and data correctness\n input_str = \"1 10 A10B 100 This is a description with spaces\"\n result = f_339(input_str)\n self.assertIsInstance(result, pd.DataFrame)\n self._test_most_columns(result, self.df1)\n def test_case_2(self):\n # Test multiline basic structure and correctness\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces\",\n \"2 15 B20C 200 Another description with spaces\",\n ]\n )\n result = f_339(input_str)\n self._test_most_columns(result, self.df_multiple)\n def test_case_3(self):\n # Test multiline with trailing whitespaces\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces \",\n \"2 15 B20C 200 Another 
description with spaces \",\n ]\n )\n result = f_339(input_str)\n self._test_most_columns(result, self.df_multiple)\n def test_case_4(self):\n # Test behavior with extra spaces in the input string\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces\",\n \"2 15 B20C 200 Another description with spaces \",\n ]\n )\n result = f_339(input_str)\n self._test_most_columns(result, self.df_multiple)\n def test_case_5(self):\n # Test code to product mapping when there are duplicates\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces\",\n \"2 15 A10B 200 Another description with spaces\",\n ]\n )\n result = f_339(input_str)\n product_names = result[\"Product\"]\n self.assertEqual(product_names.iloc[0], product_names.iloc[1])\n def test_case_6(self):\n # Test behavior with empty input string\n input_str = \"\"\n with self.assertRaises(ValueError):\n f_339(input_str)\n def test_case_7(self):\n # Test behavior with incomplete input string\n input_str = \"1 10\"\n with self.assertRaises(ValueError):\n f_339(input_str)", "apis": ["re.split", "pandas.DataFrame", "random.seed", "random.choice"], "libs": ["pandas", "re", "random"], "doc": {"description": ["Generate a Pandas DataFrame of products with their ID, quantity, code, price, product, and description", "based on a specified string of product data.", "The input string is expected to be divided into segments by newlines. 
Each segment is expected to", "be further split into parts by whitespace: ID, quantity, code, price, and a product description.", "The function will remove trailing whitespaces in each field and assign a product name per unique code.", "Product name is randomly sampled from: ['Apple', 'Banana', 'Orange', 'Pear', 'Grape'].", "The same product name will be assigned to each code for each input s, however different codes can be", "mapped to the same name.", ">>> s = '1 10 A10B 100 This is a description with spaces\\\\n2 20 B20C 200 Another description example'", ">>> df = f_339(s)", ">>> df", "ID Quantity Code Price Product Description", "0 1 10 A10B 100 Pear This is a description with spaces", "1 2 20 B20C 200 Pear Another description example"], "note": [], "params": ["s (str): Product data string split by newline, then whitespace.", "Expected format per segment: ' '", "If incomplete, this function raises ValueError.", "seed (int): Random seed for reproducibility. Defaults to 0."], "returns": ["data_df (pd.DataFrame): DataFrame with columns: ['ID', 'Quantity', 'Code', 'Price', 'Product', 'Description'].", "Quantity and Price are expected to be integers."], "reqs": ["pandas", "re", "random"], "raises": [], "example": ["Examples:", ">>> s = '1 10 A10B 100 This is a description with spaces'", ">>> df = f_339(s)", ">>> df", "ID Quantity Code Price Product Description", "0 1 10 A10B 100 Pear This is a description with spaces"]}} -{"task_id": "f_840", "prompt": "import urllib.request\nimport os\nimport zipfile\n\n# Constants\nTARGET_DIR = \"downloaded_files\"\nTARGET_ZIP_FILE = \"downloaded_files.zip\"\n\n\ndef f_840(url):\n \"\"\"\n Download and extract a zip file from a specified URL to a designated directory.\n\n Parameters:\n - url (str): The URL of the zip file.\n\n Returns:\n - str: The path of the directory where the contents of the zip file are extracted.\n\n Requirements:\n - urllib\n - os\n - zipfile\n\n Behavior:\n - If the target directory TARGET_DIR does not 
exist, it is created.\n - The zip file is downloaded from the given URL and saved locally as TARGET_ZIP_FILE.\n - The local zip file TARGET_ZIP_FILE is deleted after extraction.\n\n Error Handling:\n - The function does not explicitly handle errors that may occur during the download or extraction process.\n Errors such as a failed download, invalid URL, or corrupted zip file will result in an unhandled exception.\n\n Examples:\n >>> f_840(\"http://example.com/files.zip\")\n 'downloaded_files'\n \"\"\"", "canonical_solution": "\n os.makedirs(TARGET_DIR, exist_ok=True)\n\n # context = ssl._create_unverified_context()\n # urllib.request.urlretrieve(url, TARGET_ZIP_FILE, context=context)\n urllib.request.urlretrieve(url, TARGET_ZIP_FILE)\n\n with zipfile.ZipFile(TARGET_ZIP_FILE, \"r\") as zip_ref:\n zip_ref.extractall(TARGET_DIR)\n\n if os.path.exists(TARGET_ZIP_FILE):\n os.remove(TARGET_ZIP_FILE)\n\n return TARGET_DIR", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport os\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_840 function.\"\"\"\n def setUp(self):\n if not os.path.exists(TARGET_DIR):\n os.makedirs(TARGET_DIR)\n if os.path.exists(TARGET_DIR):\n shutil.rmtree(TARGET_DIR)\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_valid_zip_file(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function returns the correct directory path.\"\"\"\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n result = f_840(url)\n mock_urlretrieve.assert_called_with(url, TARGET_ZIP_FILE)\n self.assertEqual(result, TARGET_DIR)\n self.assertTrue(os.path.exists(TARGET_DIR))\n @patch(\"urllib.request.urlretrieve\")\n def test_invalid_url(self, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the URL is invalid.\"\"\"\n mock_urlretrieve.side_effect = Exception\n url = 
\"https://invalid.url/invalid.zip\"\n with self.assertRaises(Exception):\n f_840(url)\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_non_zip_file(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the URL does not point to a zip file.\"\"\"\n mock_zipfile.side_effect = zipfile.BadZipFile\n url = \"https://www.sample-videos.com/img/Sample-jpg-image-5mb.jpg\"\n with self.assertRaises(zipfile.BadZipFile):\n f_840(url)\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_cleanup(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function deletes the downloaded zip file after extraction.\"\"\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n f_840(url)\n self.assertFalse(os.path.exists(TARGET_ZIP_FILE))\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_directory_creation(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function creates a directory to store the extracted files.\"\"\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n f_840(url)\n self.assertTrue(os.path.exists(TARGET_DIR))\n self.assertTrue(os.path.isdir(TARGET_DIR))\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_zip_extraction_content(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function extracts the contents of the zip file.\"\"\"\n mock_extractall = MagicMock()\n mock_zipfile.return_value.__enter__.return_value.extractall = mock_extractall\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n f_840(url)\n mock_extractall.assert_called_once()\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_file_removal(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function deletes the 
downloaded zip file even if extraction fails.\"\"\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n # Create a dummy file to simulate download\n open(TARGET_ZIP_FILE, \"a\").close()\n f_840(url)\n self.assertFalse(os.path.exists(TARGET_ZIP_FILE))\n def tearDown(self):\n if os.path.exists(TARGET_DIR):\n shutil.rmtree(TARGET_DIR)", "apis": ["os.path.exists", "urllib.request.urlretrieve", "zipfile.ZipFile", "os.path", "os.remove", "os.makedirs", "urllib.request"], "libs": ["urllib", "zipfile", "os"], "doc": {"description": ["Download and extract a zip file from a specified URL to a designated directory.", "Behavior:", "- If the target directory TARGET_DIR does not exist, it is created.", "- The zip file is downloaded from the given URL and saved locally as TARGET_ZIP_FILE.", "- The local zip file TARGET_ZIP_FILE is deleted after extraction.", "Error Handling:", "- The function does not explicitly handle errors that may occur during the download or extraction process.", "Errors such as a failed download, invalid URL, or corrupted zip file will result in an unhandled exception."], "note": [], "params": ["url (str): The URL of the zip file."], "returns": ["str: The path of the directory where the contents of the zip file are extracted."], "reqs": ["urllib", "os", "zipfile"], "raises": [], "example": ["Examples:", ">>> f_840(\"http://example.com/files.zip\")", "'downloaded_files'"]}} -{"task_id": "f_802", "prompt": "import string\nimport random\n\n\ndef f_802(text, seed=None):\n \"\"\"\n Transforms the input text by replacing each alphabetic character with a random letter,\n while preserving the case and non-alphabetic characters of the original text.\n\n Parameters:\n - text (str): The input text to be transformed.\n - seed (int, optional): Random seed for reproducibility. 
Defaults to None (not set).\n\n Returns:\n - str: A transformed string with random letters replacing the alphabetic characters of the input text,\n preserving non-alphabetic characters and the original case.\n\n Requirements:\n - string\n - random\n\n Notes:\n - Alphabet replacements are chosen from ascii characters of the same case as the original.\n\n Example:\n >>> text = 'Hello, world!'\n >>> f_802(text, 0)\n 'Mynbi, qpmzj!'\n \"\"\"", "canonical_solution": "\n def replace_with_random_char(c):\n if c.isalpha():\n if c.islower():\n return random.choice(string.ascii_lowercase)\n else:\n return random.choice(string.ascii_uppercase)\n return c\n\n if seed is not None:\n random.seed(seed)\n return \"\".join(replace_with_random_char(c) for c in text)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test single word\n input_text = \"Hello\"\n output_text = f_802(input_text, seed=1)\n self.assertTrue(\n all(oc.isalpha() == ic.isalpha() for oc, ic in zip(output_text, input_text))\n )\n self.assertEqual(len(output_text), len(input_text))\n def test_case_2(self):\n # Test multiple words and punctuation\n input_text = \"Hello, World!\"\n output_text = f_802(input_text, seed=2)\n self.assertTrue(\n all(oc.isalpha() == ic.isalpha() for oc, ic in zip(output_text, input_text))\n )\n self.assertEqual(len(output_text), len(input_text))\n def test_case_3(self):\n # Test empty string\n input_text = \"\"\n output_text = f_802(input_text, seed=3)\n self.assertEqual(output_text, \"\")\n def test_case_4(self):\n # Test case preservation\n input_text = \"HeLlO\"\n output_text = f_802(input_text, seed=4)\n self.assertTrue(\n all(\n oc.isupper() == ic.isupper() and oc.islower() == ic.islower()\n for oc, ic in zip(output_text, input_text)\n )\n )\n def test_case_5(self):\n # Test numbers, special characters\n input_text = \"1234!@#$\"\n output_text = f_802(input_text, seed=5)\n self.assertEqual(\n output_text, input_text\n ) # Numbers and 
special characters should remain unchanged\n def test_case_6(self):\n # Test random seed reproducibility\n input_text = \"Colorless green ideas sleep furiously.\"\n output1 = f_802(input_text, seed=123)\n output2 = f_802(input_text, seed=123)\n self.assertEqual(output1, output2)", "apis": ["string.ascii_lowercase", "string.ascii_uppercase", "random.seed", "random.choice"], "libs": ["string", "random"], "doc": {"description": ["Transforms the input text by replacing each alphabetic character with a random letter,", "while preserving the case and non-alphabetic characters of the original text.", "Notes:", "- Alphabet replacements are chosen from ascii characters of the same case as the original."], "note": [], "params": ["text (str): The input text to be transformed.", "seed (int, optional): Random seed for reproducibility. Defaults to None (not set)."], "returns": ["str: A transformed string with random letters replacing the alphabetic characters of the input text,", "preserving non-alphabetic characters and the original case."], "reqs": ["string", "random"], "raises": [], "example": [">>> text = 'Hello, world!'", ">>> f_802(text, 0)", "'Mynbi, qpmzj!'"]}} -{"task_id": "f_798", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport re\nfrom collections import Counter\n\n\ndef f_798(mystrings, text):\n \"\"\"\n Replace spaces in given words with underscores, then plots the frequency of each unique word.\n\n Parameters:\n - mystrings (list of str): List of words/phrases where spaces need to be replaced with underscores.\n - text (str): The text in which modifications are applied and word frequencies are calculated. 
Must not be empty.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object of the plot.\n\n Raises:\n - ValueError: If the input text is empty.\n\n Requirements:\n - numpy\n - matplotlib\n - re\n - collections\n\n Notes:\n - All operations are case-insensitive.\n - The frequency plot displays each unique word on the x-axis in the order they appear after\n modification with its corresponding frequency on the y-axis.\n\n Examples:\n >>> ax = f_798(['Lorem ipsum', 'consectetur adipiscing'], 'Lorem ipsum dolor sit amet lorem Ipsum')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n if not text:\n raise ValueError(\"text cannot be empty.\")\n\n for word in mystrings:\n text = re.sub(word, word.replace(\" \", \"_\"), text, flags=re.IGNORECASE)\n\n word_counts = Counter(text.split())\n\n words, frequencies = zip(*word_counts.items())\n indices = np.arange(len(word_counts))\n\n fig, ax = plt.subplots()\n ax.bar(indices, frequencies)\n ax.set_xticks(indices)\n ax.set_xticklabels(words)\n\n return ax", "test": "import unittest\nimport matplotlib.axes\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n ax = f_798([\"hello\"], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n xtick_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertTrue(\"hello\" in xtick_labels)\n self.assertTrue(\"world!\" in xtick_labels)\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_2(self):\n # Test underscore on basic case\n ax = f_798([\"hello world\"], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(ax.get_xticklabels()[0].get_text(), \"hello_world!\")\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_3(self):\n # Test no mystrings\n ax = f_798([], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n xtick_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertTrue(\"Hello\" in xtick_labels)\n 
self.assertTrue(\"world!\" in xtick_labels)\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_4(self):\n # Test basic case with\n large_text = \"Lorem ipsum dolor sit amet \" * 10\n ax = f_798([\"Lorem ipsum\"], large_text)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n xtick_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertTrue(\"Lorem_ipsum\" in xtick_labels)\n def test_case_5(self):\n # Tests basic functionality with simple replacement and plotting.\n ax = f_798([\"hello world\"], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertIn(\n \"hello_world!\", [label.get_text() for label in ax.get_xticklabels()]\n )\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_6(self):\n # Ensures case insensitivity in replacements.\n ax = f_798([\"Hello World\"], \"hello world! Hello world!\")\n self.assertIn(\n \"Hello_World!\", [label.get_text() for label in ax.get_xticklabels()]\n )\n self.assertEqual(ax.patches[0].get_height(), 2)\n def test_case_7(self):\n # Tests behavior when no replacements should occur.\n ax = f_798([\"not in text\"], \"Hello world!\")\n self.assertNotIn(\n \"not_in_text\", [label.get_text() for label in ax.get_xticklabels()]\n )\n def test_case_8(self):\n # Tests function behavior with empty strings and lists.\n with self.assertRaises(Exception):\n f_798([], \"\")\n def test_case_9(self):\n # Tests functionality with special characters and numbers in `mystrings` and `text`.\n ax = f_798([\"test 123\", \"#$%!\"], \"Test 123 is fun. 
#$%!\")\n self.assertIn(\"test_123\", [label.get_text() for label in ax.get_xticklabels()])\n self.assertIn(\"#$%!\", [label.get_text() for label in ax.get_xticklabels()])\n def test_case_10(self):\n # Tests handling of duplicates in `mystrings`.\n ax = f_798([\"duplicate\", \"duplicate\"], \"duplicate Duplicate DUPLICATE\")\n self.assertIn(\"duplicate\", [label.get_text() for label in ax.get_xticklabels()])\n self.assertEqual(ax.patches[0].get_height(), 3)", "apis": ["numpy.arange", "re.IGNORECASE", "collections.Counter", "re.sub", "matplotlib.pyplot.subplots"], "libs": ["numpy", "collections", "re", "matplotlib"], "doc": {"description": ["Replace spaces in given words with underscores, then plots the frequency of each unique word.", "Notes:", "- All operations are case-insensitive.", "- The frequency plot displays each unique word on the x-axis in the order they appear after", "modification with its corresponding frequency on the y-axis."], "note": [], "params": ["mystrings (list of str): List of words/phrases where spaces need to be replaced with underscores.", "text (str): The text in which modifications are applied and word frequencies are calculated. Must not be empty."], "returns": ["matplotlib.axes.Axes: The Axes object of the plot."], "reqs": ["numpy", "matplotlib", "re", "collections"], "raises": ["ValueError: If the input text is empty."], "example": ["Examples:", ">>> ax = f_798(['Lorem ipsum', 'consectetur adipiscing'], 'Lorem ipsum dolor sit amet lorem Ipsum')", ">>> type(ax)", ""]}} -{"task_id": "f_848", "prompt": "import urllib.request\nimport zipfile\nimport os\nimport urllib.error\n\n\ndef f_848(\n url: str,\n save_path: str = \"downloaded_file.zip\",\n extract_path: str = \"extracted_files\",\n) -> str:\n \"\"\"\n Downloads, extracts, and deletes a ZIP file from a specified URL.\n\n The function includes comprehensive error handling to manage issues such as invalid URLs, unreachable servers, corrupted ZIP files, and file I/O errors. 
In the event of a failure, it provides a descriptive error message.\n\n Parameters:\n - url (str): The URL of the ZIP file to be downloaded.\n - save_path (str, optional): The local file path where the ZIP file will be saved temporarily. Defaults to 'downloaded_file.zip'.\n - extract_path (str, optional): The directory where the ZIP file's contents will be extracted. Defaults to 'extracted_files'.\n\n Returns:\n - str: The path to the directory where the ZIP file's contents have been extracted. Returns an error message in case of failure.\n\n Raises:\n - urllib.error.URLError: If the URL is invalid or the server cannot be reached. \n In this case, the function returns a string in the format \"URL Error: [error reason]\".\n\n Requirements:\n - urllib\n - zipfile\n - os\n - urllib\n\n Example:\n >>> extracted_path = f_848('http://www.example.com/data.zip')\n >>> print(extracted_path)\n 'extracted_files'\n\n\n \"\"\"", "canonical_solution": " try:\n # Check if save_path already exists, if so, remove it\n if os.path.exists(save_path):\n os.remove(save_path)\n\n # Download the file from the URL\n urllib.request.urlretrieve(url, save_path)\n\n # Create the extraction directory if it doesn't exist\n if not os.path.exists(extract_path):\n os.makedirs(extract_path)\n\n # Extract the zip file\n with zipfile.ZipFile(save_path, \"r\") as zip_ref:\n zip_ref.extractall(extract_path)\n\n # Remove the downloaded zip file\n os.remove(save_path)\n\n return extract_path\n except urllib.error.URLError as e:\n return f\"URL Error: {e.reason}\"", "test": "import unittest\nimport os\nimport urllib.error\nimport shutil\nfrom pathlib import Path\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_848 function.\"\"\"\n base_path = \"mnt/data/f_848_data_chien\"\n def setUp(self):\n # Ensure the base path is absolute\n self.base_path = os.path.abspath(self.base_path)\n # Create base directory for test data\n if not os.path.exists(self.base_path):\n os.makedirs(self.base_path)\n 
def test_successful_download_and_extraction_sample_1(self):\n \"\"\"Test Case 1: Successful Download and Extraction of Sample 1\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-1.zip\"\n save_path = Path(self.base_path) / \"sample_1_download.zip\"\n extract_path = Path(self.base_path) / \"sample_1_extract\"\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n self.assertTrue(os.path.exists(extract_path))\n self.assertFalse(os.path.exists(save_path))\n def test_successful_download_and_extraction_sample_2(self):\n \"\"\"Test Case 2: Successful Download and Extraction of Sample 2\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-2.zip\"\n save_path = Path(self.base_path) / \"sample_2_download.zip\"\n extract_path = Path(self.base_path) / \"sample_2_extract\"\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n self.assertTrue(os.path.exists(extract_path))\n self.assertFalse(os.path.exists(save_path))\n def test_invalid_url(self):\n \"\"\"Test Case 3: Invalid URL\"\"\"\n url = \"https://invalidurl.com/nonexistent.zip\"\n save_path = Path(self.base_path) / \"invalid_url.zip\"\n extract_path = Path(self.base_path) / \"invalid_url_extract\"\n result = f_848(url, save_path, extract_path)\n self.assertTrue(result.startswith(\"URL Error:\"))\n def test_file_already_exists_at_save_path(self):\n \"\"\"Test Case 4: File Already Exists at Save Path\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-1.zip\"\n save_path = Path(self.base_path) / \"existing_file.zip\"\n extract_path = Path(self.base_path) / \"existing_file_extract\"\n # Create a dummy file at the save path\n with open(save_path, \"w\") as file:\n file.write(\"Dummy content\")\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n self.assertFalse(os.path.exists(save_path))\n def test_extraction_path_already_exists(self):\n \"\"\"Test Case 5: 
Extraction Path Already Exists\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-2.zip\"\n save_path = Path(self.base_path) / \"extract_path_exists.zip\"\n extract_path = Path(self.base_path) / \"existing_extract_path\"\n # Create the extraction path directory\n if not os.path.exists(extract_path):\n os.makedirs(extract_path)\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n @classmethod\n def tearDownClass(cls):\n # Clean up any files or directories created during the tests\n shutil.rmtree(cls.base_path, ignore_errors=True)\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["os.path.exists", "urllib.request.urlretrieve", "zipfile.ZipFile", "os.path", "os.remove", "urllib.error", "os.makedirs", "urllib.request"], "libs": ["urllib", "zipfile", "os"], "doc": {"description": ["Downloads, extracts, and deletes a ZIP file from a specified URL.", "The function includes comprehensive error handling to manage issues such as invalid URLs, unreachable servers, corrupted ZIP files, and file I/O errors. In the event of a failure, it provides a descriptive error message."], "note": [], "params": ["url (str): The URL of the ZIP file to be downloaded.", "save_path (str, optional): The local file path where the ZIP file will be saved temporarily. Defaults to 'downloaded_file.zip'.", "extract_path (str, optional): The directory where the ZIP file's contents will be extracted. Defaults to 'extracted_files'."], "returns": ["str: The path to the directory where the ZIP file's contents have been extracted. 
Returns an error message in case of failure."], "reqs": ["urllib", "zipfile", "os", "urllib"], "raises": ["urllib.error.URLError: If the URL is invalid or the server cannot be reached.", "In this case, the function returns a string in the format \"URL Error: [error reason]\"."], "example": [">>> extracted_path = f_848('http://www.example.com/data.zip')", ">>> print(extracted_path)", "'extracted_files'"]}} -{"task_id": "f_812", "prompt": "import numpy as np\nfrom sklearn.preprocessing import MinMaxScaler\nimport pandas as pd\n\n\ndef f_812(df: pd.DataFrame) -> pd.DataFrame:\n \"\"\"\n Computes the MinMax-normalized cumulative sum for each numeric column in the given DataFrame.\n\n Parameters:\n - df (pandas.DataFrame): The input DataFrame containing numerical values.\n\n Returns:\n - pd.DataFrame: A DataFrame where each column contains the normalized cumulative sum of the\n respective column in the input DataFrame, retaining the original column names.\n\n Raises:\n - TypeError: If the DataFrame contains non-numeric data types.\n - ValueError: If the DataFrame is empty or contains NaN values.\n\n Requirements:\n - pandas\n - numpy\n - sklearn\n\n Example:\n >>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [3, 2, 1]})\n >>> output_df = f_812(input_df)\n >>> type(output_df)\n \n >>> output_df\n A B\n 0 0.0 0.000000\n 1 0.4 0.666667\n 2 1.0 1.000000\n \"\"\"", "canonical_solution": " if df.select_dtypes(include=np.number).shape[1] != df.shape[1]:\n raise TypeError(\"Input DataFrame contains non-numeric data types.\")\n if df.empty or df.isnull().values.any():\n raise ValueError(\"Input DataFrame is empty or contains NaN values.\")\n\n df_cumsum = df.cumsum()\n scaler = MinMaxScaler()\n df_norm_cumsum = pd.DataFrame(scaler.fit_transform(df_cumsum), columns=df.columns)\n\n return df_norm_cumsum", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def check_cumsum_and_scaling(self, input_df, expected_output):\n output 
= f_812(input_df)\n pd.testing.assert_frame_equal(\n output, expected_output, check_dtype=False, atol=1e-5\n )\n def test_incremental_values(self):\n before = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [3, 2, 1]})\n after = pd.DataFrame({\"A\": [0.0, 0.4, 1.0], \"B\": [0.0, 0.66666667, 1.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_negative_numbers(self):\n before = pd.DataFrame({\"A\": [-1, -2, -3], \"B\": [-3, -2, -1]})\n after = pd.DataFrame({\"A\": [1.0, 0.6, 0.0], \"B\": [1.0, 0.33333333, 0.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_all_zeros(self):\n before = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n after = pd.DataFrame({\"A\": [0.0, 0.0, 0.0], \"B\": [0.0, 0.0, 0.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_same_numbers(self):\n before = pd.DataFrame({\"A\": [5, 5, 5], \"B\": [2, 2, 2]})\n after = pd.DataFrame({\"A\": [0.0, 0.5, 1.0], \"B\": [0.0, 0.5, 1.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_non_numeric_data_raises(self):\n with self.assertRaises(TypeError):\n f_812(pd.DataFrame({\"A\": [\"one\", \"two\", \"three\"], \"B\": [1, 2, 3]}))\n def test_nan_values_raise(self):\n with self.assertRaises(ValueError):\n f_812(pd.DataFrame({\"A\": [1, np.nan, 3], \"B\": [3, 2, 1]}))\n def test_empty_dataframe(self):\n with self.assertRaises(ValueError):\n f_812(pd.DataFrame())", "apis": ["sklearn.preprocessing.MinMaxScaler", "pandas.DataFrame", "numpy.number"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Computes the MinMax-normalized cumulative sum for each numeric column in the given DataFrame."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame containing numerical values."], "returns": ["pd.DataFrame: A 
DataFrame where each column contains the normalized cumulative sum of the", "respective column in the input DataFrame, retaining the original column names."], "reqs": ["pandas", "numpy", "sklearn"], "raises": ["TypeError: If the DataFrame contains non-numeric data types.", "ValueError: If the DataFrame is empty or contains NaN values."], "example": [">>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [3, 2, 1]})", ">>> output_df = f_812(input_df)", ">>> type(output_df)", "", ">>> output_df", "A B", "0 0.0 0.000000", "1 0.4 0.666667", "2 1.0 1.000000"]}} -{"task_id": "f_742", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_742(d):\n \"\"\"\n Calculate mean, sum, max, min and standard deviation for the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d.\"\n \n Parameters:\n d (list): A list of dictionaries.\n\n Returns:\n dict: A dictionary with keys as 'x', 'y', and 'z' and values as dictionaries of statistics.\n\n Raises:\n - ValueError: If input is not a list of dictionaries.\n\n Requirements:\n - pandas\n - numpy\n\n Examples:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n >>> f_742(data)\n {'x': {'mean': 2.0, 'sum': 6, 'max': 3, 'min': 1, 'std': 0.816496580927726}, 'y': {'mean': 8.666666666666666, 'sum': 26, 'max': 15, 'min': 1, 'std': 5.792715732327589}, 'z': {'mean': 6.0, 'sum': 18, 'max': 7, 'min': 5, 'std': 0.816496580927726}}\n >>> f_742([])\n {'x': None, 'y': None, 'z': None}\n >>> f_742([{'a': 1}])\n {'x': None, 'y': None, 'z': None}\n \"\"\"", "canonical_solution": " if not isinstance(d, list) or any(not isinstance(item, dict) for item in d):\n raise ValueError(\"Input must be a list of dictionaries.\")\n \n if not d:\n return {key: None for key in ['x', 'y', 'z']}\n\n df = pd.DataFrame(d).fillna(0) # Replace missing values with 0 to allow computations\n stats = {}\n\n for key in ['x', 'y', 'z']:\n if key in df.columns:\n stats[key] = {\n 'mean': np.mean(df[key]),\n 'sum': 
np.sum(df[key]),\n 'max': np.max(df[key]),\n 'min': np.min(df[key]),\n 'std': np.std(df[key], ddof=0) # Population standard deviation\n }\n else:\n stats[key] = None\n\n return stats", "test": "# Test suite\nimport unittest\nclass TestCases(unittest.TestCase):\n def test_empty_list(self):\n self.assertEqual(f_742([]), {'x': None, 'y': None, 'z': None})\n def test_valid_input(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n result = f_742(data)\n self.assertAlmostEqual(result['x']['mean'], 2.0)\n self.assertAlmostEqual(result['y']['mean'], 8.666666666666666)\n self.assertAlmostEqual(result['z']['mean'], 6.0)\n def test_invalid_input_type(self):\n with self.assertRaises(ValueError):\n f_742(\"not a list\")\n def test_partial_keys(self):\n data = [{'x': 1, 'y': 2}, {'y': 3, 'z': 4}]\n result = f_742(data)\n self.assertIsNotNone(result['x'])\n self.assertIsNotNone(result['y'])\n self.assertIsNotNone(result['z'])\n def test_all_keys_missing(self):\n data = [{'a': 1}, {'b': 2}]\n self.assertEqual(f_742(data), {'x': None, 'y': None, 'z': None})", "apis": ["numpy.mean", "numpy.std", "numpy.min", "numpy.max", "numpy.sum", "pandas.DataFrame"], "libs": ["numpy", "pandas"], "doc": {"description": ["Calculate mean, sum, max, min and standard deviation for the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d.\""], "note": [], "params": ["d (list): A list of dictionaries."], "returns": ["dict: A dictionary with keys as 'x', 'y', and 'z' and values as dictionaries of statistics."], "reqs": ["pandas", "numpy"], "raises": ["ValueError: If input is not a list of dictionaries."], "example": ["Examples:", ">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]", ">>> f_742(data)", "{'x': {'mean': 2.0, 'sum': 6, 'max': 3, 'min': 1, 'std': 0.816496580927726}, 'y': {'mean': 8.666666666666666, 'sum': 26, 'max': 15, 'min': 1, 'std': 5.792715732327589}, 'z': {'mean': 6.0, 'sum': 18, 'max': 7, 
'min': 5, 'std': 0.816496580927726}}", ">>> f_742([])", "{'x': None, 'y': None, 'z': None}", ">>> f_742([{'a': 1}])", "{'x': None, 'y': None, 'z': None}"]}} -{"task_id": "f_582", "prompt": "import pandas as pd\nfrom sklearn.cluster import KMeans\n\ndef f_582(x_list, y_list):\n \"\"\"\n Perform K-Means clustering on the given data by first turning it into a DataFrame with two columns \"x\" and \"y\" and then return the labels and centroids.\n\n Parameters:\n - x_list (list): List of data corresponding to 'x'\n - y_list (list): List of data corresponding to 'y'\n\n Returns:\n tuple: The labels and centroids as numpy arrays.\n - kmeans.labels_: A NumPy array where each element is the cluster label assigned to each data point. \n - kmeans.cluster_centers_: A NumPy array containing the coordinates of the cluster centers.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [2, 3, 4, 5, 6, 7]})\n >>> labels, centroids = f_582([1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7])\n \"\"\"", "canonical_solution": " df = pd.DataFrame({'x': x_list, 'y': y_list})\n kmeans = KMeans(n_clusters=2, random_state=0).fit(df)\n return kmeans.labels_, kmeans.cluster_centers_", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n labels, centroids = f_582([1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 1)\n self.assertEqual(labels[4], 1)\n self.assertEqual(labels[5], 1)\n self.assertEqual(centroids[0][0], 2.)\n self.assertEqual(centroids[0][1], 3.)\n self.assertEqual(centroids[1][0], 5.)\n self.assertEqual(centroids[1][1], 6.)\n def test_case_2(self):\n labels, centroids = f_582([1, 1, 1, 1, 1, 1], [2, 2, 2, 2, 2, 2])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 0)\n self.assertEqual(labels[4], 0)\n 
self.assertEqual(labels[5], 0)\n self.assertEqual(centroids[0][0], 1.)\n self.assertEqual(centroids[0][1], 2.)\n def test_case_3(self):\n labels, centroids = f_582([1, 2, 3, 4, 5, 6], [2, 2, 2, 2, 2, 2])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 1)\n self.assertEqual(labels[4], 1)\n self.assertEqual(labels[5], 1)\n self.assertEqual(centroids[0][0], 2.)\n self.assertEqual(centroids[0][1], 2.)\n self.assertEqual(centroids[1][0], 5.)\n self.assertEqual(centroids[1][1], 2.)\n def test_case_4(self):\n labels, centroids = f_582([0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n def test_case_5(self):\n labels, centroids = f_582([1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 1)\n self.assertEqual(labels[4], 1)\n self.assertEqual(labels[5], 1)\n self.assertEqual(centroids[0][0], 2.)\n self.assertEqual(centroids[0][1], 2.)\n self.assertEqual(centroids[1][0], 5.)\n self.assertEqual(centroids[1][1], 5.)", "apis": ["sklearn.cluster.KMeans", "pandas.DataFrame"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform K-Means clustering on the given data by first turning it into a DataFrame with two columns \"x\" and \"y\" and then return the labels and centroids."], "note": [], "params": ["x_list (list): List of data corresponding to 'x'", "y_list (list): List of data corresponding to 'y'"], "returns": ["tuple: The labels and centroids as numpy arrays.", "kmeans.labels_: A NumPy array where each element is the cluster label assigned to each data point.", "kmeans.cluster_centers_: A NumPy array containing the coordinates of the cluster centers."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [2, 3, 4, 5, 6, 7]})", ">>> labels, centroids = 
f_582([1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7])"]}} -{"task_id": "f_412", "prompt": "from collections import defaultdict\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_412(data):\n \"\"\"\n Calculate statistical measurements (mean and standard deviation) of the values associated with\n each key in a list of dictionaries and visualize them with bar charts.\n\n Parameters:\n data (list): The list of dictionaries. Must not be empty. Each dictionary must have numeric values.\n The function raises ValueError if the input list is empty and TypeError if the input is not a\n list of dictionaries or contains non-numeric values.\n\n Returns:\n tuple:\n - dict: A dictionary with keys and their corresponding mean and standard deviation.\n - list: A list of matplotlib Axes objects for each key's visualization.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - collections.defaultdict\n \n Example:\n >>> stats, axes = f_412([{'cat': 1, 'dog': 3}, {'cat' : 2, 'dog': 5}, {'cat' : 3, 'dog': 7}])\n >>> stats\n {'cat': {'mean': 2.0, 'std': 0.816496580927726}, 'dog': {'mean': 5.0, 'std': 1.632993161855452}}\n >>> axes\n [, ]\n \"\"\"", "canonical_solution": " if not data:\n raise ValueError(\"Input data is empty.\")\n if not isinstance(data, list) or not all(isinstance(d, dict) for d in data):\n raise TypeError(\"Input must be a list of dictionaries.\")\n for d in data:\n if not all(isinstance(value, (int, float)) for value in d.values()):\n raise TypeError(\"All values in the dictionaries must be numeric.\")\n\n stats = defaultdict(list)\n for d in data:\n for key, value in d.items():\n stats[key].append(value)\n\n result = {k: {\"mean\": np.mean(v), \"std\": np.std(v)} for k, v in stats.items()}\n\n # Visualization\n axes = []\n for key in result:\n fig, ax = plt.subplots()\n ax.bar(x=[\"mean\", \"std\"], height=result[key].values())\n ax.set_title(f\"Statistics of {key}\")\n ax.set_ylabel(\"Value\")\n axes.append(ax)\n\n return result, axes", "test": "import 
unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n data = [{\"cat\": 1, \"dog\": 3}, {\"cat\": 2, \"dog\": 5}, {\"cat\": 3, \"dog\": 7}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats,\n {\n \"cat\": {\"mean\": 2.0, \"std\": 0.816496580927726},\n \"dog\": {\"mean\": 5.0, \"std\": 1.632993161855452},\n },\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n self.assertEqual(axes[1].get_title(), \"Statistics of dog\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_2(self):\n # Test other keys (animals)\n data = [{\"bird\": 5, \"fish\": 10}, {\"bird\": 6, \"fish\": 8}, {\"bird\": 7, \"fish\": 9}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats,\n {\n \"bird\": {\"mean\": 6.0, \"std\": 0.816496580927726},\n \"fish\": {\"mean\": 9.0, \"std\": 0.816496580927726},\n },\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of bird\")\n self.assertEqual(axes[1].get_title(), \"Statistics of fish\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_3(self):\n # Test handling negatives\n data = [{\"cat\": -1, \"dog\": -3}, {\"cat\": -2, \"dog\": -5}, {\"cat\": -3, \"dog\": -7}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats,\n {\n \"cat\": {\"mean\": -2.0, \"std\": 0.816496580927726},\n \"dog\": {\"mean\": -5.0, \"std\": 1.632993161855452},\n },\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n self.assertEqual(axes[1].get_title(), \"Statistics of dog\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_4(self):\n # Test single input\n data = [{\"cat\": 1}]\n stats, axes = f_412(data)\n self.assertEqual(stats, {\"cat\": {\"mean\": 
1.0, \"std\": 0.0}})\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_5(self):\n # Test handling zero\n data = [{\"cat\": 0, \"dog\": 0}, {\"cat\": 0, \"dog\": 0}, {\"cat\": 0, \"dog\": 0}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats, {\"cat\": {\"mean\": 0.0, \"std\": 0.0}, \"dog\": {\"mean\": 0.0, \"std\": 0.0}}\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n self.assertEqual(axes[1].get_title(), \"Statistics of dog\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_6(self):\n # Test correct handling of empty input\n with self.assertRaises(ValueError):\n f_412([])\n def test_case_7(self):\n # Test correct handling of incorrect input types\n with self.assertRaises(TypeError):\n f_412(\"not a list\")\n with self.assertRaises(TypeError):\n f_412([123])\n with self.assertRaises(TypeError):\n f_412([{\"cat\": \"not numeric\"}])\n def test_case_8(self):\n # Test with a mix of positive and negative integers\n data = [\n {\"apple\": -2, \"banana\": 4},\n {\"apple\": -4, \"banana\": 6},\n {\"apple\": -6, \"banana\": 8},\n ]\n stats, _ = f_412(data)\n self.assertEqual(\n stats,\n {\n \"apple\": {\"mean\": -4.0, \"std\": 1.632993161855452},\n \"banana\": {\"mean\": 6.0, \"std\": 1.632993161855452},\n },\n )\n def test_case_9(self):\n # Test with floating point numbers\n data = [{\"x\": 0.5, \"y\": 1.5}, {\"x\": 2.5, \"y\": 3.5}, {\"x\": 4.5, \"y\": 5.5}]\n stats, _ = f_412(data)\n self.assertEqual(\n stats,\n {\n \"x\": {\"mean\": 2.5, \"std\": 1.632993161855452},\n \"y\": {\"mean\": 3.5, \"std\": 1.632993161855452},\n },\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["collections.defaultdict", "numpy.std", "matplotlib.pyplot.subplots", 
"numpy.mean"], "libs": ["numpy", "collections", "matplotlib"], "doc": {"description": ["Calculate statistical measurements (mean and standard deviation) of the values associated with", "each key in a list of dictionaries and visualize them with bar charts."], "note": [], "params": ["data (list): The list of dictionaries. Must not be empty. Each dictionary must have numeric values.", "The function raises ValueError if the input list is empty and TypeError if the input is not a", "list of dictionaries or contains non-numeric values."], "returns": ["tuple:", "dict: A dictionary with keys and their corresponding mean and standard deviation.", "list: A list of matplotlib Axes objects for each key's visualization."], "reqs": ["numpy", "matplotlib.pyplot", "collections.defaultdict"], "raises": [], "example": [">>> stats, axes = f_412([{'cat': 1, 'dog': 3}, {'cat' : 2, 'dog': 5}, {'cat' : 3, 'dog': 7}])", ">>> stats", "{'cat': {'mean': 2.0, 'std': 0.816496580927726}, 'dog': {'mean': 5.0, 'std': 1.632993161855452}}", ">>> axes", "[, ]"]}} +{"task_id": "f_346", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_346(P, T):\n \"\"\"\n Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then visualize the\n result in 3D with matplotlib. 
Note: This function only accepts numpy matrices/arrays.\n\n Parameters:\n P (numpy.ndarray): The input matrix with shape (N, 3), where N is the number of rows.\n T (numpy.ndarray): The input tensor with shape (3, 3, 3).\n\n Returns:\n tuple:\n - result (numpy.ndarray): The product of matrix P and tensor T with shape (N, 3).\n - ax (mpl_toolkits.mplot3d.axes3d.Axes3D): The 3D visualization of the result.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1]])\n >>> T = np.random.rand(3, 3, 3)\n >>> result, ax = f_346(P, T)\n >>> type(result)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if not (isinstance(P, np.ndarray) and isinstance(T, np.ndarray)):\n raise TypeError(\"Expected inputs to be numpy arrays\")\n\n # Compute the matrix-tensor product to ensure the result has the desired shape\n result = np.einsum(\"ij,jkl->ik\", P, T)\n\n # Visualize the result in 3D\n fig = plt.figure()\n ax = fig.add_subplot(111, projection=\"3d\")\n ax.scatter(result[:, 0], result[:, 1], result[:, 2])\n\n # Return the result and the 3D visualization\n return result, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n self.test_P = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n self.test_T = np.random.rand(3, 3, 3)\n def check_result_correctness(self, P, T, result):\n # Manually compute the expected result for the matrix-tensor product\n expected_result = np.einsum(\"ij,jkl->ik\", P, T)\n return np.allclose(result, expected_result)\n def test_case_1(self):\n # Test output visualization\n _, ax = f_346(self.test_P, self.test_T)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"\")\n self.assertEqual(ax.get_xlabel(), \"\")\n self.assertEqual(ax.get_ylabel(), \"\")\n ax.set_title(\"Test Title\")\n ax.set_xlabel(\"X Label\")\n ax.set_ylabel(\"Y Label\")\n self.assertEqual(ax.get_title(), 
\"Test Title\")\n self.assertEqual(ax.get_xlabel(), \"X Label\")\n self.assertEqual(ax.get_ylabel(), \"Y Label\")\n def test_case_2(self):\n # Test result correctness\n result, _ = f_346(self.test_P, self.test_T)\n self.assertTrue(self.check_result_correctness(self.test_P, self.test_T, result))\n self.assertEqual(result.shape, (self.test_P.shape[0], 3))\n def test_case_3(self):\n # Test with zeros and negative values\n P = np.array([[0, 0, 0]])\n T = np.random.rand(3, 3, 3) - 0.5\n result, _ = f_346(P, T)\n self.assertTrue(np.all(result == 0))\n def test_case_4(self):\n # Test with non-numeric data\n P = np.array([[\"a\", \"b\", \"c\"], [1, 2, 3]])\n with self.assertRaises(Exception):\n f_346(P, self.test_T)\n def test_case_5(self):\n # Test incompatible shapes\n P = np.array([[1, 2], [3, 4]])\n with self.assertRaises(Exception):\n f_346(P, self.test_T)\n def test_case_6(self):\n # Test incompatible input types\n with self.assertRaises(Exception):\n f_346([1, 2], [2, 1])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.figure", "numpy.einsum", "numpy.ndarray"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then visualize the", "result in 3D with matplotlib. 
Note: This function only accepts numpy matrices/arrays."], "note": [], "params": ["P (numpy.ndarray): The input matrix with shape (N, 3), where N is the number of rows.", "T (numpy.ndarray): The input tensor with shape (3, 3, 3)."], "returns": ["tuple:", "result (numpy.ndarray): The product of matrix P and tensor T with shape (N, 3).", "ax (mpl_toolkits.mplot3d.axes3d.Axes3D): The 3D visualization of the result."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1]])", ">>> T = np.random.rand(3, 3, 3)", ">>> result, ax = f_346(P, T)", ">>> type(result)", "", ">>> type(ax)", ""]}} +{"task_id": "f_359", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_359(L):\n \"\"\"\n Draw a histogram of all elements in a nested list 'L' and return the Axes object of the plot.\n\n The function first uses Numpy to handle array operations, checking for correct input type\n while ignoring empty sublists. It then plots the histogram using pandas, assigning\n each unique value its own bin and plotting the histogram with rwidth 0.8.\n\n Parameters:\n L (list of list of int): Nested list of integers.\n\n Returns:\n ax (matplotlib.axes._axes.Axes): The Axes object of the histogram plot.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> ax = f_359([[1,2,3],[4,5,6]])\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7')]\n \"\"\"", "canonical_solution": "\n flattened = np.concatenate([l for l in L if l])\n if not np.issubdtype(flattened.dtype, np.integer):\n raise TypeError(\"Expected list of list of int\")\n bins = len(np.unique(flattened))\n ax = pd.Series(flattened).plot(kind=\"hist\", rwidth=0.8, bins=bins)\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test 
non-overlapping numbers split into multi-item lists\n ax = f_359([[1, 2, 3], [4, 5, 6]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 12)\n def test_case_2(self):\n # Test non-overlapping numbers in individual lists\n ax = f_359([[1], [2], [3], [4], [5], [6]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 6)\n def test_case_3(self):\n # Test overlapping numbers split into multi-item lists\n ax = f_359([[1, 1], [2, 2], [3, 3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_4(self):\n # Test overlapping numbers that repeat across items\n ax = f_359([[1, 2], [1, 3], [2, 3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_5(self):\n # Test overlapping numbers in individual lists\n ax = f_359([[1], [1], [2], [2], [3], [3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_6(self):\n # Test case with uneven segment sizes\n ax = f_359([[10, 20, 30], [40]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 4)\n def test_case_7(self):\n # Test negative integers\n ax = f_359([[-1, -2], [-2, -3]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_8(self):\n # Test larger integers\n ax = f_359([[10000, 20000], [30000]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 3)\n def test_case_9(self):\n # Test single element\n ax = f_359([[1]])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 1)\n def test_case_10(self):\n # Test handling mix of valid sublists and empty ones\n ax = f_359([[], [1, 2], [], [3, 4], []])\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(len(ax.patches), 4)\n def test_case_11(self):\n # Test handling NumPy array conversion\n ax = f_359([[np.int64(1)], [np.int32(2)], [3]])\n self.assertIsInstance(ax, plt.Axes)\n 
self.assertEqual(len(ax.patches), 3)\n def test_case_12(self):\n # Test handling invalid input - fully empty lists, excessive nesting\n with self.assertRaises(ValueError):\n f_359([[], [], []])\n with self.assertRaises(ValueError):\n f_359([[[1]], [2], [3]])\n def test_case_13(self):\n # Test handling invalid input - non-int types\n with self.assertRaises(TypeError):\n f_359([1.1, 2.2], [3.3])\n with self.assertRaises(TypeError):\n f_359([\"1\", \"2\"], [\"3\", \"4\"])\n with self.assertRaises(TypeError):\n f_359([[1, 2], [\"a\", \"b\"]])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.Series", "numpy.integer", "numpy.unique", "numpy.concatenate", "numpy.issubdtype"], "libs": ["numpy", "pandas"], "doc": {"description": ["Draw a histogram of all elements in a nested list 'L' and return the Axes object of the plot.", "The function first uses Numpy to handle array operations, checking for correct input type", "while ignoring empty sublists. It then plots the histogram using pandas, assigning", "each unique value its own bin and plotting the histogram with rwidth 0.8."], "note": [], "params": ["L (list of list of int): Nested list of integers."], "returns": ["ax (matplotlib.axes._axes.Axes): The Axes object of the histogram plot."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> ax = f_359([[1,2,3],[4,5,6]])", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0.0, 0, '0'), Text(1.0, 0, '1'), Text(2.0, 0, '2'), Text(3.0, 0, '3'), Text(4.0, 0, '4'), Text(5.0, 0, '5'), Text(6.0, 0, '6'), Text(7.0, 0, '7')]"]}} +{"task_id": "f_925", "prompt": "import pandas as pd\nimport seaborn as sns\n\n\ndef f_925(data=None):\n \"\"\"\n Converts string-formatted weights to floats and plots a scatter plot of weight against height.\n\n This function takes a dictionary with two keys: 'Weight_String' and 'Height'. 
The 'Weight_String' key should \n contain a list of weight values in string format, while the 'Height' key should have a list of corresponding \n height values in numerical format. If the input dictionary is not provided, the function uses a default dataset.\n The function then converts the string-formatted weights into float, and plots a scatter plot to visualize \n the relationship between weight and height.\n \n Parameters:\n - data (dict, optional): A dictionary with keys 'Weight_String' and 'Height'. 'Weight_String' is expected to be \n a list of weight values in string format (e.g., ['60.5', '65.7']), and 'Height' is expected \n to be a list of corresponding numerical height values (e.g., [160, 165]). If no dictionary \n is provided, a default dataset with predetermined values is used.\n Default dictionary:\n {\n 'Weight_String': ['60.5', '65.7', '70.2', '75.9', '80.1'],\n 'Height': [160, 165, 170, 175, 180]\n }\n\n Returns:\n - ax (matplotlib.axes._subplots.Axes): A scatter plot with weight on the x-axis and height on the y-axis, titled \"Weight vs Height\".\n\n Raises:\n - ValueError: If any of the values in the 'Weight_String' key are not formatted as strings. 
This validation ensures \n that the weight data is in the expected format for conversion to float.\n\n Requirements:\n - pandas\n - seaborn\n\n Example:\n >>> ax = f_925()\n >>> print(ax.get_title())\n Weight vs Height\n \"\"\"", "canonical_solution": " if data is None:\n data = {\n \"Weight_String\": [\"60.5\", \"65.7\", \"70.2\", \"75.9\", \"80.1\"],\n \"Height\": [160, 165, 170, 175, 180],\n }\n\n df = pd.DataFrame(data)\n\n # Validate weight values are strings\n if not all(isinstance(weight, str) for weight in df[\"Weight_String\"]):\n raise ValueError(\"Weights must be provided as strings.\")\n\n # Convert string weights to floats\n df[\"Weight_Float\"] = df[\"Weight_String\"].astype(float)\n\n # Plotting the scatter plot\n ax = sns.scatterplot(data=df, x=\"Weight_Float\", y=\"Height\")\n ax.set_title(\"Weight vs Height\")\n return ax", "test": "import unittest\nimport pandas as pd\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_925\"\"\"\n def test_default_data(self):\n \"\"\"Test f_925 with its default data.\"\"\"\n result = f_925()\n self.assertIsInstance(result, Axes)\n def test_custom_data(self):\n \"\"\"Test f_925 with custom data.\"\"\"\n custom_data = {\n \"Weight_String\": [\"50.5\", \"55.7\", \"60.2\"],\n \"Height\": [150, 155, 160],\n }\n result = f_925(custom_data)\n self.assertIsInstance(result, Axes)\n def test_incorrect_data_type(self):\n \"\"\"Test f_925 with incorrect data types in Weight_String.\"\"\"\n incorrect_data = {\n \"Weight_String\": [\n 60.5,\n 65.7,\n 70.2,\n ], # Intentionally using floats instead of strings\n \"Height\": [160, 165, 170],\n }\n with self.assertRaises(ValueError):\n f_925(incorrect_data)\n def test_empty_data(self):\n \"\"\"Test f_925 with empty data.\"\"\"\n empty_data = {\"Weight_String\": [], \"Height\": []}\n result = f_925(empty_data)\n self.assertIsInstance(result, Axes)\n def test_mismatched_data_length(self):\n \"\"\"Test f_925 with mismatched lengths of 
Weight_String and Height.\"\"\"\n mismatched_data = {\n \"Weight_String\": [\"60.5\", \"65.7\"], # Less weights than heights\n \"Height\": [160, 165, 170],\n }\n with self.assertRaises(ValueError):\n f_925(mismatched_data)", "apis": ["pandas.DataFrame", "seaborn.scatterplot"], "libs": ["seaborn", "pandas"], "doc": {"description": ["Converts string-formatted weights to floats and plots a scatter plot of weight against height.", "This function takes a dictionary with two keys: 'Weight_String' and 'Height'. The 'Weight_String' key should", "contain a list of weight values in string format, while the 'Height' key should have a list of corresponding", "height values in numerical format. If the input dictionary is not provided, the function uses a default dataset.", "The function then converts the string-formatted weights into float, and plots a scatter plot to visualize", "the relationship between weight and height."], "note": [], "params": ["data (dict, optional): A dictionary with keys 'Weight_String' and 'Height'. 'Weight_String' is expected to be", "a list of weight values in string format (e.g., ['60.5', '65.7']), and 'Height' is expected", "to be a list of corresponding numerical height values (e.g., [160, 165]). If no dictionary", "is provided, a default dataset with predetermined values is used.", "Default dictionary:", "{", "'Weight_String': ['60.5', '65.7', '70.2', '75.9', '80.1'],", "'Height': [160, 165, 170, 175, 180]", "}"], "returns": ["ax (matplotlib.axes._subplots.Axes): A scatter plot with weight on the x-axis and height on the y-axis, titled \"Weight vs Height\"."], "reqs": ["pandas", "seaborn"], "raises": ["ValueError: If any of the values in the 'Weight_String' key are not formatted as strings. 
This validation ensures", "that the weight data is in the expected format for conversion to float."], "example": [">>> ax = f_925()", ">>> print(ax.get_title())", "Weight vs Height"]}} +{"task_id": "f_774", "prompt": "from collections import Counter\nimport re\n\ndef f_774(word: str) -> list:\n \"\"\"\n Finds the most common two-letter combination in a given, cleaned word (lowercased and alphabetic characters only) \n and returns its frequency. The search is case-insensitive and ignores non-alphabetic characters.\n \n Requirements:\n - collections.Counter\n - re\n \n Parameters:\n - word (str): The input string containing the word to analyze. The word should have a length of at least 2 to form pairs.\n \n Returns:\n - list: A list containing a single tuple. The tuple consists of the most frequent two-letter combination (str) \n and its frequency (int). Returns an empty list if the word has fewer than 2 letters, or after cleaning, \n the word has fewer than 2 alphabetic characters.\n \n Examples:\n >>> f_774(\"aaBBcc\")\n [('aa', 1)]\n >>> f_774(\"abc!abc\")\n [('ab', 2)]\n >>> f_774(\"a\")\n []\n >>> f_774(\"abcd\")\n [('ab', 1)]\n >>> f_774(\"a1b2c3\")\n [('ab', 1)]\n \"\"\"", "canonical_solution": " # Clean the word: lowercase and keep alphabetic characters only\n clean_word = re.sub('[^a-z]', '', word.lower())\n \n if len(clean_word) < 2:\n return []\n \n pairs = [clean_word[i:i+2] for i in range(len(clean_word) - 1)]\n pair_counter = Counter(pairs)\n most_common = pair_counter.most_common(1)\n \n # This check ensures we return the result directly from most_common without additional filtering\n return most_common", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_repeating_pairs(self):\n self.assertEqual(f_774(\"aabbcc\"), [('aa', 1)], \"Should identify single repeating pair\")\n \n def test_mixed_repeating_pairs(self):\n self.assertEqual(f_774(\"abcabc\"), [('ab', 2)], \"Should identify most frequent pair in mixed sequence\")\n \n def 
test_single_character(self):\n self.assertEqual(f_774(\"a\"), [], \"Should return empty list for single character\")\n \n def test_unique_pairs(self):\n self.assertEqual(f_774(\"abcdef\"), [('ab', 1)], \"Should handle all unique pairs\")\n \n def test_empty_string(self):\n self.assertEqual(f_774(\"\"), [], \"Should return empty list for empty string\")\n def test_case_insensitive(self):\n # Corrected the expected count to match the correct behavior of the function\n self.assertEqual(f_774(\"aAaAbbBB\"), [('aa', 3)], \"Should be case-insensitive\")\n def test_ignore_non_alphabetic(self):\n self.assertEqual(f_774(\"abc123abc!\"), [('ab', 2)], \"Should ignore non-alphabetic characters\")", "apis": ["re.sub", "collections.Counter"], "libs": ["re", "collections"], "doc": {"description": ["Finds the most common two-letter combination in a given, cleaned word (lowercased and alphabetic characters only)", "and returns its frequency. The search is case-insensitive and ignores non-alphabetic characters."], "note": [], "params": ["word (str): The input string containing the word to analyze. The word should have a length of at least 2 to form pairs."], "returns": ["list: A list containing a single tuple. The tuple consists of the most frequent two-letter combination (str)", "and its frequency (int). 
Returns an empty list if the word has fewer than 2 letters, or after cleaning,", "the word has fewer than 2 alphabetic characters."], "reqs": ["collections.Counter", "re"], "raises": [], "example": ["Examples:", ">>> f_774(\"aaBBcc\")", "[('aa', 1)]", ">>> f_774(\"abc!abc\")", "[('ab', 2)]", ">>> f_774(\"a\")", "[]", ">>> f_774(\"abcd\")", "[('ab', 1)]", ">>> f_774(\"a1b2c3\")", "[('ab', 1)]"]}} +{"task_id": "f_859", "prompt": "import requests\nfrom PIL import Image\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_859(url: str) -> \"matplotlib.axes._axes.Axes\":\n \"\"\"\n Downloads an image from the specified URL, converts it to grayscale, and generates a histogram of its grayscale values.\n\n Parameters:\n - url (str): The URL of the image to be downloaded. Must be a valid URL pointing to an image.\n\n Returns:\n - matplotlib.axes._axes.Axes: The Axes object of the generated histogram.\n\n Raises:\n - ValueError: If the URL is invalid or if there's an error downloading the image. Error message will specify the download issue.\n - IOError: If there's an error in opening or processing the downloaded image. 
Error message will specify the processing issue.\n\n Requirements:\n - requests\n - PIL\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_859(\"https://www.example.com/myimage.jpg\")\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " response = None # Initialize response to None\n # Validate the URL\n if not isinstance(url, str) or not url:\n raise ValueError(\"Invalid URL provided.\")\n\n # Download the image with error handling\n try:\n response = requests.get(url, stream=True, timeout=10)\n response.raise_for_status()\n img = Image.open(response.raw).convert(\"L\")\n except requests.RequestException as e:\n raise ValueError(f\"Error downloading the image: {e}\") from e\n except IOError as e:\n raise IOError(f\"Error processing the image: {e}\") from e\n finally:\n if response: # Check if response is not None before closing\n response.close()\n\n # Convert the image to a numpy array\n img_array = np.array(img)\n\n # Create the histogram and return the Axes object\n _, ax = plt.subplots()\n ax.hist(img_array.ravel(), bins=256, color=\"gray\", alpha=0.7)\n ax.set_title(\"Grayscale Histogram\")\n return ax", "test": "import unittest\nfrom unittest.mock import patch, MagicMock, Mock\nimport requests\nimport matplotlib\nfrom PIL import Image\nimport io\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_859.\"\"\"\n def create_mock_image(self):\n \"\"\"\n Creates a mock grayscale image in memory.\n \"\"\"\n img = Image.new(\"L\", (100, 100), color=\"gray\")\n img_byte_arr = io.BytesIO()\n img.save(img_byte_arr, format=\"JPEG\")\n img_byte_arr.seek(0) # Important: move to the start of the BytesIO object\n return img_byte_arr\n @patch(\"requests.get\")\n def test_valid_image_url(self, mock_get):\n \"\"\"\n Test if the function correctly processes a valid image URL and returns a matplotlib Axes object with the correct title.\n \"\"\"\n mock_img = self.create_mock_image()\n mock_get.return_value = Mock(ok=True)\n mock_get.return_value.raw = mock_img\n 
ax = f_859(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertIsInstance(\n ax,\n matplotlib.axes._axes.Axes,\n \"Return type should be matplotlib.axes._axes.Axes\",\n )\n self.assertEqual(\n ax.get_title(),\n \"Grayscale Histogram\",\n \"Histogram should have the title 'Grayscale Histogram'\",\n )\n @patch(\"requests.get\")\n def test_invalid_image_url(self, mock_get):\n \"\"\"\n Test if the function raises a ValueError when provided with an invalid URL.\n \"\"\"\n mock_get.side_effect = requests.exceptions.RequestException\n with self.assertRaises(ValueError):\n f_859(\"invalid_url\")\n @patch(\"requests.get\")\n def test_histogram_bins(self, mock_get):\n \"\"\"\n Test if the histogram generated by the function contains the correct number of bins.\n \"\"\"\n mock_img = self.create_mock_image()\n mock_get.return_value = Mock(ok=True)\n mock_get.return_value.raw = mock_img\n ax = f_859(\"https://www.google.com/images/srpr/logo11w.png\")\n n, bins, _ = ax.hist([], bins=256)\n self.assertEqual(len(bins), 257, \"There should be 257 bin edges for 256 bins\")\n @patch(\"requests.get\")\n def test_histogram_data_range(self, mock_get):\n \"\"\"\n Test if the data range of the histogram is appropriate for a grayscale image (0 to 255).\n \"\"\"\n mock_img = self.create_mock_image()\n mock_get.return_value = Mock(ok=True)\n mock_get.return_value.raw = mock_img\n ax = f_859(\"https://www.google.com/images/srpr/logo11w.png\")\n n, bins, _ = ax.hist([], bins=256)\n self.assertTrue(\n bins[0] >= 0 and bins[-1] <= 255, \"Data range should be between 0 and 255\"\n )\n @patch(\"requests.get\")\n def test_empty_url(self, mock_get):\n \"\"\"\n Test if the function raises a ValueError when provided with an empty URL string.\n \"\"\"\n mock_get.side_effect = requests.exceptions.RequestException\n with self.assertRaises(ValueError):\n f_859(\"\")\n @patch(\"requests.get\")\n @patch(\"PIL.Image.open\")\n def test_ioerror_image_processing(self, mock_image_open, mock_get):\n 
\"\"\"\n Test if the function raises an IOError when there is an error in processing the image.\n \"\"\"\n # Mock requests.get to return a valid response\n mock_get.return_value = MagicMock(ok=True)\n mock_get.return_value.raw = MagicMock()\n # Mock PIL.Image.open to raise IOError\n mock_image_open.side_effect = IOError(\"Mocked IOError\")\n with self.assertRaises(IOError) as context:\n f_859(\"https://www.example.com/image.jpg\")\n self.assertEqual(\n str(context.exception), \"Error processing the image: Mocked IOError\"\n )\n def tearDown(self):\n plt.close()", "apis": ["requests.get", "PIL.Image.open", "numpy.array", "matplotlib.pyplot.subplots", "requests.RequestException"], "libs": ["PIL", "numpy", "matplotlib", "requests"], "doc": {"description": ["Downloads an image from the specified URL, converts it to grayscale, and generates a histogram of its grayscale values."], "note": [], "params": ["url (str): The URL of the image to be downloaded. Must be a valid URL pointing to an image."], "returns": ["matplotlib.axes._axes.Axes: The Axes object of the generated histogram."], "reqs": ["requests", "PIL", "numpy", "matplotlib.pyplot"], "raises": ["ValueError: If the URL is invalid or if there's an error downloading the image. Error message will specify the download issue.", "IOError: If there's an error in opening or processing the downloaded image. Error message will specify the processing issue."], "example": [">>> ax = f_859(\"https://www.example.com/myimage.jpg\")", ">>> type(ax)", ""]}} +{"task_id": "f_339", "prompt": "import pandas as pd\nimport re\nimport random\n\n\ndef f_339(s: str, seed: int = 0) -> pd.DataFrame:\n \"\"\"\n Generate a Pandas DataFrame of products with their ID, quantity, code, price, product, and description\n based on a specified string of product data.\n\n The input string is expected to be divided into segments by newlines. 
Each segment is expected to\n be further split into parts by whitespace: ID, quantity, code, price, and a product description.\n The function will remove trailing whitespaces in each field and assign a product name per unique code.\n Product name is randomly sampled from: ['Apple', 'Banana', 'Orange', 'Pear', 'Grape'].\n The same product name will be assigned to each code for each input s, however different codes can be\n mapped to the same name.\n\n Parameters:\n - s (str): Product data string split by newline, then whitespace.\n Expected format per segment: ' '\n If incomplete, this function raises ValueError.\n - seed (int): Random seed for reproducibility. Defaults to 0.\n\n Returns:\n - data_df (pd.DataFrame): DataFrame with columns: ['ID', 'Quantity', 'Code', 'Price', 'Product', 'Description'].\n Quantity and Price are expected to be integers.\n\n Requirements:\n - pandas\n - re\n - random\n\n Examples:\n >>> s = '1 10 A10B 100 This is a description with spaces'\n >>> df = f_339(s)\n >>> df\n ID Quantity Code Price Product Description\n 0 1 10 A10B 100 Pear This is a description with spaces\n\n >>> s = '1 10 A10B 100 This is a description with spaces\\\\n2 20 B20C 200 Another description example'\n >>> df = f_339(s)\n >>> df\n ID Quantity Code Price Product Description\n 0 1 10 A10B 100 Pear This is a description with spaces\n 1 2 20 B20C 200 Pear Another description example\n \"\"\"", "canonical_solution": "\n if not s:\n raise ValueError(\"Incomplete data provided.\")\n\n random.seed(seed)\n\n products = [\"Apple\", \"Banana\", \"Orange\", \"Pear\", \"Grape\"]\n code_to_product = dict()\n\n data_list = []\n segments = [segment.strip() for segment in s.split(\"\\n\")]\n for segment in segments:\n if segment:\n elements = re.split(r\"\\s+\", segment.strip(), 4)\n if len(elements) < 5:\n raise ValueError(\"Incomplete data provided.\")\n id, quantity, code, price, description = elements\n product = code_to_product.get(code, random.choice(products))\n 
data_list.append([id, quantity, code, price, product, description])\n df = pd.DataFrame(\n data_list, columns=[\"ID\", \"Quantity\", \"Code\", \"Price\", \"Product\", \"Description\"]\n )\n df[\"Quantity\"] = df[\"Quantity\"].astype(int)\n df[\"Price\"] = df[\"Price\"].astype(int)\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.df1 = pd.DataFrame(\n {\n \"ID\": [\"1\"],\n \"Quantity\": [\"10\"],\n \"Code\": [\"A10B\"],\n \"Price\": [\"100\"],\n \"Description\": [\"This is a description with spaces\"],\n }\n )\n self.df2 = pd.DataFrame(\n {\n \"ID\": [\"2\"],\n \"Quantity\": [\"15\"],\n \"Code\": [\"B20C\"],\n \"Price\": [\"200\"],\n \"Description\": [\"Another description with spaces\"],\n }\n )\n self.df_multiple = pd.concat([self.df1, self.df2]).reset_index(drop=True)\n for col in [\"Quantity\", \"Price\"]:\n self.df1[col] = self.df1[col].astype(int)\n self.df2[col] = self.df2[col].astype(int)\n self.df_multiple[col] = self.df_multiple[col].astype(int)\n def _test_most_columns(self, df1, df2):\n columns_to_test = [\"ID\", \"Quantity\", \"Code\", \"Price\", \"Description\"]\n for col in columns_to_test:\n pd.testing.assert_series_equal(df1[col], df2[col])\n def test_case_1(self):\n # Test basic structure and data correctness\n input_str = \"1 10 A10B 100 This is a description with spaces\"\n result = f_339(input_str)\n self.assertIsInstance(result, pd.DataFrame)\n self._test_most_columns(result, self.df1)\n def test_case_2(self):\n # Test multiline basic structure and correctness\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces\",\n \"2 15 B20C 200 Another description with spaces\",\n ]\n )\n result = f_339(input_str)\n self._test_most_columns(result, self.df_multiple)\n def test_case_3(self):\n # Test multiline with trailing whitespaces\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces \",\n \"2 15 B20C 200 Another 
description with spaces \",\n ]\n )\n result = f_339(input_str)\n self._test_most_columns(result, self.df_multiple)\n def test_case_4(self):\n # Test behavior with extra spaces in the input string\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces\",\n \"2 15 B20C 200 Another description with spaces \",\n ]\n )\n result = f_339(input_str)\n self._test_most_columns(result, self.df_multiple)\n def test_case_5(self):\n # Test code to product mapping when there are duplicates\n input_str = \"\\n\".join(\n [\n \"1 10 A10B 100 This is a description with spaces\",\n \"2 15 A10B 200 Another description with spaces\",\n ]\n )\n result = f_339(input_str)\n product_names = result[\"Product\"]\n self.assertEqual(product_names.iloc[0], product_names.iloc[1])\n def test_case_6(self):\n # Test behavior with empty input string\n input_str = \"\"\n with self.assertRaises(ValueError):\n f_339(input_str)\n def test_case_7(self):\n # Test behavior with incomplete input string\n input_str = \"1 10\"\n with self.assertRaises(ValueError):\n f_339(input_str)", "apis": ["pandas.DataFrame", "re.split", "random.seed", "random.choice"], "libs": ["random", "re", "pandas"], "doc": {"description": ["Generate a Pandas DataFrame of products with their ID, quantity, code, price, product, and description", "based on a specified string of product data.", "The input string is expected to be divided into segments by newlines. 
Each segment is expected to", "be further split into parts by whitespace: ID, quantity, code, price, and a product description.", "The function will remove trailing whitespaces in each field and assign a product name per unique code.", "Product name is randomly sampled from: ['Apple', 'Banana', 'Orange', 'Pear', 'Grape'].", "The same product name will be assigned to each code for each input s, however different codes can be", "mapped to the same name.", ">>> s = '1 10 A10B 100 This is a description with spaces\\\\n2 20 B20C 200 Another description example'", ">>> df = f_339(s)", ">>> df", "ID Quantity Code Price Product Description", "0 1 10 A10B 100 Pear This is a description with spaces", "1 2 20 B20C 200 Pear Another description example"], "note": [], "params": ["s (str): Product data string split by newline, then whitespace.", "Expected format per segment: ' '", "If incomplete, this function raises ValueError.", "seed (int): Random seed for reproducibility. Defaults to 0."], "returns": ["data_df (pd.DataFrame): DataFrame with columns: ['ID', 'Quantity', 'Code', 'Price', 'Product', 'Description'].", "Quantity and Price are expected to be integers."], "reqs": ["pandas", "re", "random"], "raises": [], "example": ["Examples:", ">>> s = '1 10 A10B 100 This is a description with spaces'", ">>> df = f_339(s)", ">>> df", "ID Quantity Code Price Product Description", "0 1 10 A10B 100 Pear This is a description with spaces"]}} +{"task_id": "f_840", "prompt": "import urllib.request\nimport os\nimport zipfile\n\n# Constants\nTARGET_DIR = \"downloaded_files\"\nTARGET_ZIP_FILE = \"downloaded_files.zip\"\n\n\ndef f_840(url):\n \"\"\"\n Download and extract a zip file from a specified URL to a designated directory.\n\n Parameters:\n - url (str): The URL of the zip file.\n\n Returns:\n - str: The path of the directory where the contents of the zip file are extracted.\n\n Requirements:\n - urllib\n - os\n - zipfile\n\n Behavior:\n - If the target directory TARGET_DIR does not 
exist, it is created.\n - The zip file is downloaded from the given URL and saved locally as TARGET_ZIP_FILE.\n - The local zip file TARGET_ZIP_FILE is deleted after extraction.\n\n Error Handling:\n - The function does not explicitly handle errors that may occur during the download or extraction process.\n Errors such as a failed download, invalid URL, or corrupted zip file will result in an unhandled exception.\n\n Examples:\n >>> f_840(\"http://example.com/files.zip\")\n 'downloaded_files'\n \"\"\"", "canonical_solution": "\n os.makedirs(TARGET_DIR, exist_ok=True)\n\n # context = ssl._create_unverified_context()\n # urllib.request.urlretrieve(url, TARGET_ZIP_FILE, context=context)\n urllib.request.urlretrieve(url, TARGET_ZIP_FILE)\n\n with zipfile.ZipFile(TARGET_ZIP_FILE, \"r\") as zip_ref:\n zip_ref.extractall(TARGET_DIR)\n\n if os.path.exists(TARGET_ZIP_FILE):\n os.remove(TARGET_ZIP_FILE)\n\n return TARGET_DIR", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport os\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_840 function.\"\"\"\n def setUp(self):\n if not os.path.exists(TARGET_DIR):\n os.makedirs(TARGET_DIR)\n if os.path.exists(TARGET_DIR):\n shutil.rmtree(TARGET_DIR)\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_valid_zip_file(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function returns the correct directory path.\"\"\"\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n result = f_840(url)\n mock_urlretrieve.assert_called_with(url, TARGET_ZIP_FILE)\n self.assertEqual(result, TARGET_DIR)\n self.assertTrue(os.path.exists(TARGET_DIR))\n @patch(\"urllib.request.urlretrieve\")\n def test_invalid_url(self, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the URL is invalid.\"\"\"\n mock_urlretrieve.side_effect = Exception\n url = 
\"https://invalid.url/invalid.zip\"\n with self.assertRaises(Exception):\n f_840(url)\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_non_zip_file(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the URL does not point to a zip file.\"\"\"\n mock_zipfile.side_effect = zipfile.BadZipFile\n url = \"https://www.sample-videos.com/img/Sample-jpg-image-5mb.jpg\"\n with self.assertRaises(zipfile.BadZipFile):\n f_840(url)\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_cleanup(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function deletes the downloaded zip file after extraction.\"\"\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n f_840(url)\n self.assertFalse(os.path.exists(TARGET_ZIP_FILE))\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_directory_creation(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function creates a directory to store the extracted files.\"\"\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n f_840(url)\n self.assertTrue(os.path.exists(TARGET_DIR))\n self.assertTrue(os.path.isdir(TARGET_DIR))\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_zip_extraction_content(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function extracts the contents of the zip file.\"\"\"\n mock_extractall = MagicMock()\n mock_zipfile.return_value.__enter__.return_value.extractall = mock_extractall\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n f_840(url)\n mock_extractall.assert_called_once()\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"zipfile.ZipFile\")\n def test_file_removal(self, mock_zipfile, mock_urlretrieve):\n \"\"\"Test that the function deletes the 
downloaded zip file even if extraction fails.\"\"\"\n mock_zipfile.return_value.__enter__.return_value = MagicMock()\n url = \"https://www.sample-videos.com/zip/Sample-Zip-5mb.zip\"\n # Create a dummy file to simulate download\n open(TARGET_ZIP_FILE, \"a\").close()\n f_840(url)\n self.assertFalse(os.path.exists(TARGET_ZIP_FILE))\n def tearDown(self):\n if os.path.exists(TARGET_DIR):\n shutil.rmtree(TARGET_DIR)", "apis": ["zipfile.ZipFile", "urllib.request", "os.makedirs", "os.remove", "os.path", "os.path.exists", "urllib.request.urlretrieve"], "libs": ["zipfile", "urllib", "os"], "doc": {"description": ["Download and extract a zip file from a specified URL to a designated directory.", "Behavior:", "- If the target directory TARGET_DIR does not exist, it is created.", "- The zip file is downloaded from the given URL and saved locally as TARGET_ZIP_FILE.", "- The local zip file TARGET_ZIP_FILE is deleted after extraction.", "Error Handling:", "- The function does not explicitly handle errors that may occur during the download or extraction process.", "Errors such as a failed download, invalid URL, or corrupted zip file will result in an unhandled exception."], "note": [], "params": ["url (str): The URL of the zip file."], "returns": ["str: The path of the directory where the contents of the zip file are extracted."], "reqs": ["urllib", "os", "zipfile"], "raises": [], "example": ["Examples:", ">>> f_840(\"http://example.com/files.zip\")", "'downloaded_files'"]}} +{"task_id": "f_802", "prompt": "import string\nimport random\n\n\ndef f_802(text, seed=None):\n \"\"\"\n Transforms the input text by replacing each alphabetic character with a random letter,\n while preserving the case and non-alphabetic characters of the original text.\n\n Parameters:\n - text (str): The input text to be transformed.\n - seed (int, optional): Random seed for reproducibility. 
Defaults to None (not set).\n\n Returns:\n - str: A transformed string with random letters replacing the alphabetic characters of the input text,\n preserving non-alphabetic characters and the original case.\n\n Requirements:\n - string\n - random\n\n Notes:\n - Alphabet replacements are chosen from ascii characters of the same case as the original.\n\n Example:\n >>> text = 'Hello, world!'\n >>> f_802(text, 0)\n 'Mynbi, qpmzj!'\n \"\"\"", "canonical_solution": "\n def replace_with_random_char(c):\n if c.isalpha():\n if c.islower():\n return random.choice(string.ascii_lowercase)\n else:\n return random.choice(string.ascii_uppercase)\n return c\n\n if seed is not None:\n random.seed(seed)\n return \"\".join(replace_with_random_char(c) for c in text)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test single word\n input_text = \"Hello\"\n output_text = f_802(input_text, seed=1)\n self.assertTrue(\n all(oc.isalpha() == ic.isalpha() for oc, ic in zip(output_text, input_text))\n )\n self.assertEqual(len(output_text), len(input_text))\n def test_case_2(self):\n # Test multiple words and punctuation\n input_text = \"Hello, World!\"\n output_text = f_802(input_text, seed=2)\n self.assertTrue(\n all(oc.isalpha() == ic.isalpha() for oc, ic in zip(output_text, input_text))\n )\n self.assertEqual(len(output_text), len(input_text))\n def test_case_3(self):\n # Test empty string\n input_text = \"\"\n output_text = f_802(input_text, seed=3)\n self.assertEqual(output_text, \"\")\n def test_case_4(self):\n # Test case preservation\n input_text = \"HeLlO\"\n output_text = f_802(input_text, seed=4)\n self.assertTrue(\n all(\n oc.isupper() == ic.isupper() and oc.islower() == ic.islower()\n for oc, ic in zip(output_text, input_text)\n )\n )\n def test_case_5(self):\n # Test numbers, special characters\n input_text = \"1234!@#$\"\n output_text = f_802(input_text, seed=5)\n self.assertEqual(\n output_text, input_text\n ) # Numbers and 
special characters should remain unchanged\n def test_case_6(self):\n # Test random seed reproducibility\n input_text = \"Colorless green ideas sleep furiously.\"\n output1 = f_802(input_text, seed=123)\n output2 = f_802(input_text, seed=123)\n self.assertEqual(output1, output2)", "apis": ["string.ascii_uppercase", "string.ascii_lowercase", "random.seed", "random.choice"], "libs": ["random", "string"], "doc": {"description": ["Transforms the input text by replacing each alphabetic character with a random letter,", "while preserving the case and non-alphabetic characters of the original text.", "Notes:", "- Alphabet replacements are chosen from ascii characters of the same case as the original."], "note": [], "params": ["text (str): The input text to be transformed.", "seed (int, optional): Random seed for reproducibility. Defaults to None (not set)."], "returns": ["str: A transformed string with random letters replacing the alphabetic characters of the input text,", "preserving non-alphabetic characters and the original case."], "reqs": ["string", "random"], "raises": [], "example": [">>> text = 'Hello, world!'", ">>> f_802(text, 0)", "'Mynbi, qpmzj!'"]}} +{"task_id": "f_798", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nimport re\nfrom collections import Counter\n\n\ndef f_798(mystrings, text):\n \"\"\"\n Replace spaces in given words with underscores, then plots the frequency of each unique word.\n\n Parameters:\n - mystrings (list of str): List of words/phrases where spaces need to be replaced with underscores.\n - text (str): The text in which modifications are applied and word frequencies are calculated. 
Must not be empty.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object of the plot.\n\n Raises:\n - ValueError: If the input text is empty.\n\n Requirements:\n - numpy\n - matplotlib\n - re\n - collections\n\n Notes:\n - All operations are case-insensitive.\n - The frequency plot displays each unique word on the x-axis in the order they appear after\n modification with its corresponding frequency on the y-axis.\n\n Examples:\n >>> ax = f_798(['Lorem ipsum', 'consectetur adipiscing'], 'Lorem ipsum dolor sit amet lorem Ipsum')\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n if not text:\n raise ValueError(\"text cannot be empty.\")\n\n for word in mystrings:\n text = re.sub(word, word.replace(\" \", \"_\"), text, flags=re.IGNORECASE)\n\n word_counts = Counter(text.split())\n\n words, frequencies = zip(*word_counts.items())\n indices = np.arange(len(word_counts))\n\n fig, ax = plt.subplots()\n ax.bar(indices, frequencies)\n ax.set_xticks(indices)\n ax.set_xticklabels(words)\n\n return ax", "test": "import unittest\nimport matplotlib.axes\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n ax = f_798([\"hello\"], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n xtick_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertTrue(\"hello\" in xtick_labels)\n self.assertTrue(\"world!\" in xtick_labels)\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_2(self):\n # Test underscore on basic case\n ax = f_798([\"hello world\"], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertEqual(ax.get_xticklabels()[0].get_text(), \"hello_world!\")\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_3(self):\n # Test no mystrings\n ax = f_798([], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n xtick_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertTrue(\"Hello\" in xtick_labels)\n 
self.assertTrue(\"world!\" in xtick_labels)\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_4(self):\n # Test basic case with\n large_text = \"Lorem ipsum dolor sit amet \" * 10\n ax = f_798([\"Lorem ipsum\"], large_text)\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n xtick_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertTrue(\"Lorem_ipsum\" in xtick_labels)\n def test_case_5(self):\n # Tests basic functionality with simple replacement and plotting.\n ax = f_798([\"hello world\"], \"Hello world!\")\n self.assertIsInstance(ax, matplotlib.axes.Axes)\n self.assertIn(\n \"hello_world!\", [label.get_text() for label in ax.get_xticklabels()]\n )\n self.assertEqual(ax.patches[0].get_height(), 1)\n def test_case_6(self):\n # Ensures case insensitivity in replacements.\n ax = f_798([\"Hello World\"], \"hello world! Hello world!\")\n self.assertIn(\n \"Hello_World!\", [label.get_text() for label in ax.get_xticklabels()]\n )\n self.assertEqual(ax.patches[0].get_height(), 2)\n def test_case_7(self):\n # Tests behavior when no replacements should occur.\n ax = f_798([\"not in text\"], \"Hello world!\")\n self.assertNotIn(\n \"not_in_text\", [label.get_text() for label in ax.get_xticklabels()]\n )\n def test_case_8(self):\n # Tests function behavior with empty strings and lists.\n with self.assertRaises(Exception):\n f_798([], \"\")\n def test_case_9(self):\n # Tests functionality with special characters and numbers in `mystrings` and `text`.\n ax = f_798([\"test 123\", \"#$%!\"], \"Test 123 is fun. 
#$%!\")\n self.assertIn(\"test_123\", [label.get_text() for label in ax.get_xticklabels()])\n self.assertIn(\"#$%!\", [label.get_text() for label in ax.get_xticklabels()])\n def test_case_10(self):\n # Tests handling of duplicates in `mystrings`.\n ax = f_798([\"duplicate\", \"duplicate\"], \"duplicate Duplicate DUPLICATE\")\n self.assertIn(\"duplicate\", [label.get_text() for label in ax.get_xticklabels()])\n self.assertEqual(ax.patches[0].get_height(), 3)", "apis": ["numpy.arange", "collections.Counter", "re.sub", "matplotlib.pyplot.subplots", "re.IGNORECASE"], "libs": ["re", "collections", "numpy", "matplotlib"], "doc": {"description": ["Replace spaces in given words with underscores, then plots the frequency of each unique word.", "Notes:", "- All operations are case-insensitive.", "- The frequency plot displays each unique word on the x-axis in the order they appear after", "modification with its corresponding frequency on the y-axis."], "note": [], "params": ["mystrings (list of str): List of words/phrases where spaces need to be replaced with underscores.", "text (str): The text in which modifications are applied and word frequencies are calculated. Must not be empty."], "returns": ["matplotlib.axes.Axes: The Axes object of the plot."], "reqs": ["numpy", "matplotlib", "re", "collections"], "raises": ["ValueError: If the input text is empty."], "example": ["Examples:", ">>> ax = f_798(['Lorem ipsum', 'consectetur adipiscing'], 'Lorem ipsum dolor sit amet lorem Ipsum')", ">>> type(ax)", ""]}} +{"task_id": "f_848", "prompt": "import urllib.request\nimport zipfile\nimport os\nimport urllib.error\n\n\ndef f_848(\n url: str,\n save_path: str = \"downloaded_file.zip\",\n extract_path: str = \"extracted_files\",\n) -> str:\n \"\"\"\n Downloads, extracts, and deletes a ZIP file from a specified URL.\n\n The function includes comprehensive error handling to manage issues such as invalid URLs, unreachable servers, corrupted ZIP files, and file I/O errors. 
In the event of a failure, it provides a descriptive error message.\n\n Parameters:\n - url (str): The URL of the ZIP file to be downloaded.\n - save_path (str, optional): The local file path where the ZIP file will be saved temporarily. Defaults to 'downloaded_file.zip'.\n - extract_path (str, optional): The directory where the ZIP file's contents will be extracted. Defaults to 'extracted_files'.\n\n Returns:\n - str: The path to the directory where the ZIP file's contents have been extracted. Returns an error message in case of failure.\n\n Raises:\n - urllib.error.URLError: If the URL is invalid or the server cannot be reached. \n In this case, the function returns a string in the format \"URL Error: [error reason]\".\n\n Requirements:\n - urllib\n - zipfile\n - os\n - urllib\n\n Example:\n >>> extracted_path = f_848('http://www.example.com/data.zip')\n >>> print(extracted_path)\n 'extracted_files'\n\n\n \"\"\"", "canonical_solution": " try:\n # Check if save_path already exists, if so, remove it\n if os.path.exists(save_path):\n os.remove(save_path)\n\n # Download the file from the URL\n urllib.request.urlretrieve(url, save_path)\n\n # Create the extraction directory if it doesn't exist\n if not os.path.exists(extract_path):\n os.makedirs(extract_path)\n\n # Extract the zip file\n with zipfile.ZipFile(save_path, \"r\") as zip_ref:\n zip_ref.extractall(extract_path)\n\n # Remove the downloaded zip file\n os.remove(save_path)\n\n return extract_path\n except urllib.error.URLError as e:\n return f\"URL Error: {e.reason}\"", "test": "import unittest\nimport os\nimport urllib.error\nimport shutil\nfrom pathlib import Path\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_848 function.\"\"\"\n base_path = \"mnt/data/f_848_data_chien\"\n def setUp(self):\n # Ensure the base path is absolute\n self.base_path = os.path.abspath(self.base_path)\n # Create base directory for test data\n if not os.path.exists(self.base_path):\n os.makedirs(self.base_path)\n 
def test_successful_download_and_extraction_sample_1(self):\n \"\"\"Test Case 1: Successful Download and Extraction of Sample 1\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-1.zip\"\n save_path = Path(self.base_path) / \"sample_1_download.zip\"\n extract_path = Path(self.base_path) / \"sample_1_extract\"\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n self.assertTrue(os.path.exists(extract_path))\n self.assertFalse(os.path.exists(save_path))\n def test_successful_download_and_extraction_sample_2(self):\n \"\"\"Test Case 2: Successful Download and Extraction of Sample 2\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-2.zip\"\n save_path = Path(self.base_path) / \"sample_2_download.zip\"\n extract_path = Path(self.base_path) / \"sample_2_extract\"\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n self.assertTrue(os.path.exists(extract_path))\n self.assertFalse(os.path.exists(save_path))\n def test_invalid_url(self):\n \"\"\"Test Case 3: Invalid URL\"\"\"\n url = \"https://invalidurl.com/nonexistent.zip\"\n save_path = Path(self.base_path) / \"invalid_url.zip\"\n extract_path = Path(self.base_path) / \"invalid_url_extract\"\n result = f_848(url, save_path, extract_path)\n self.assertTrue(result.startswith(\"URL Error:\"))\n def test_file_already_exists_at_save_path(self):\n \"\"\"Test Case 4: File Already Exists at Save Path\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-1.zip\"\n save_path = Path(self.base_path) / \"existing_file.zip\"\n extract_path = Path(self.base_path) / \"existing_file_extract\"\n # Create a dummy file at the save path\n with open(save_path, \"w\") as file:\n file.write(\"Dummy content\")\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n self.assertFalse(os.path.exists(save_path))\n def test_extraction_path_already_exists(self):\n \"\"\"Test Case 5: 
Extraction Path Already Exists\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-2.zip\"\n save_path = Path(self.base_path) / \"extract_path_exists.zip\"\n extract_path = Path(self.base_path) / \"existing_extract_path\"\n # Create the extraction path directory\n if not os.path.exists(extract_path):\n os.makedirs(extract_path)\n result_path = f_848(url, save_path, extract_path)\n self.assertEqual(result_path, extract_path)\n @classmethod\n def tearDownClass(cls):\n # Clean up any files or directories created during the tests\n shutil.rmtree(cls.base_path, ignore_errors=True)\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["urllib.error", "urllib.request", "zipfile.ZipFile", "os.makedirs", "os.remove", "os.path", "os.path.exists", "urllib.request.urlretrieve"], "libs": ["zipfile", "urllib", "os"], "doc": {"description": ["Downloads, extracts, and deletes a ZIP file from a specified URL.", "The function includes comprehensive error handling to manage issues such as invalid URLs, unreachable servers, corrupted ZIP files, and file I/O errors. In the event of a failure, it provides a descriptive error message."], "note": [], "params": ["url (str): The URL of the ZIP file to be downloaded.", "save_path (str, optional): The local file path where the ZIP file will be saved temporarily. Defaults to 'downloaded_file.zip'.", "extract_path (str, optional): The directory where the ZIP file's contents will be extracted. Defaults to 'extracted_files'."], "returns": ["str: The path to the directory where the ZIP file's contents have been extracted. 
Returns an error message in case of failure."], "reqs": ["urllib", "zipfile", "os", "urllib"], "raises": ["urllib.error.URLError: If the URL is invalid or the server cannot be reached.", "In this case, the function returns a string in the format \"URL Error: [error reason]\"."], "example": [">>> extracted_path = f_848('http://www.example.com/data.zip')", ">>> print(extracted_path)", "'extracted_files'"]}} +{"task_id": "f_812", "prompt": "import numpy as np\nfrom sklearn.preprocessing import MinMaxScaler\nimport pandas as pd\n\n\ndef f_812(df: pd.DataFrame) -> pd.DataFrame:\n \"\"\"\n Computes the MinMax-normalized cumulative sum for each numeric column in the given DataFrame.\n\n Parameters:\n - df (pandas.DataFrame): The input DataFrame containing numerical values.\n\n Returns:\n - pd.DataFrame: A DataFrame where each column contains the normalized cumulative sum of the\n respective column in the input DataFrame, retaining the original column names.\n\n Raises:\n - TypeError: If the DataFrame contains non-numeric data types.\n - ValueError: If the DataFrame is empty or contains NaN values.\n\n Requirements:\n - pandas\n - numpy\n - sklearn\n\n Example:\n >>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [3, 2, 1]})\n >>> output_df = f_812(input_df)\n >>> type(output_df)\n \n >>> output_df\n A B\n 0 0.0 0.000000\n 1 0.4 0.666667\n 2 1.0 1.000000\n \"\"\"", "canonical_solution": " if df.select_dtypes(include=np.number).shape[1] != df.shape[1]:\n raise TypeError(\"Input DataFrame contains non-numeric data types.\")\n if df.empty or df.isnull().values.any():\n raise ValueError(\"Input DataFrame is empty or contains NaN values.\")\n\n df_cumsum = df.cumsum()\n scaler = MinMaxScaler()\n df_norm_cumsum = pd.DataFrame(scaler.fit_transform(df_cumsum), columns=df.columns)\n\n return df_norm_cumsum", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def check_cumsum_and_scaling(self, input_df, expected_output):\n output 
= f_812(input_df)\n pd.testing.assert_frame_equal(\n output, expected_output, check_dtype=False, atol=1e-5\n )\n def test_incremental_values(self):\n before = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [3, 2, 1]})\n after = pd.DataFrame({\"A\": [0.0, 0.4, 1.0], \"B\": [0.0, 0.66666667, 1.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_negative_numbers(self):\n before = pd.DataFrame({\"A\": [-1, -2, -3], \"B\": [-3, -2, -1]})\n after = pd.DataFrame({\"A\": [1.0, 0.6, 0.0], \"B\": [1.0, 0.33333333, 0.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_all_zeros(self):\n before = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n after = pd.DataFrame({\"A\": [0.0, 0.0, 0.0], \"B\": [0.0, 0.0, 0.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_same_numbers(self):\n before = pd.DataFrame({\"A\": [5, 5, 5], \"B\": [2, 2, 2]})\n after = pd.DataFrame({\"A\": [0.0, 0.5, 1.0], \"B\": [0.0, 0.5, 1.0]})\n self.check_cumsum_and_scaling(before, after)\n self.assertEqual(set(before.columns), set(after.columns))\n def test_non_numeric_data_raises(self):\n with self.assertRaises(TypeError):\n f_812(pd.DataFrame({\"A\": [\"one\", \"two\", \"three\"], \"B\": [1, 2, 3]}))\n def test_nan_values_raise(self):\n with self.assertRaises(ValueError):\n f_812(pd.DataFrame({\"A\": [1, np.nan, 3], \"B\": [3, 2, 1]}))\n def test_empty_dataframe(self):\n with self.assertRaises(ValueError):\n f_812(pd.DataFrame())", "apis": ["pandas.DataFrame", "numpy.number", "sklearn.preprocessing.MinMaxScaler"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Computes the MinMax-normalized cumulative sum for each numeric column in the given DataFrame."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame containing numerical values."], "returns": ["pd.DataFrame: A 
DataFrame where each column contains the normalized cumulative sum of the", "respective column in the input DataFrame, retaining the original column names."], "reqs": ["pandas", "numpy", "sklearn"], "raises": ["TypeError: If the DataFrame contains non-numeric data types.", "ValueError: If the DataFrame is empty or contains NaN values."], "example": [">>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [3, 2, 1]})", ">>> output_df = f_812(input_df)", ">>> type(output_df)", "", ">>> output_df", "A B", "0 0.0 0.000000", "1 0.4 0.666667", "2 1.0 1.000000"]}} +{"task_id": "f_742", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_742(d):\n \"\"\"\n Calculate mean, sum, max, min and standard deviation for the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d.\"\n \n Parameters:\n d (list): A list of dictionaries.\n\n Returns:\n dict: A dictionary with keys as 'x', 'y', and 'z' and values as dictionaries of statistics.\n\n Raises:\n - ValueError: If input is not a list of dictionaries.\n\n Requirements:\n - pandas\n - numpy\n\n Examples:\n >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n >>> f_742(data)\n {'x': {'mean': 2.0, 'sum': 6, 'max': 3, 'min': 1, 'std': 0.816496580927726}, 'y': {'mean': 8.666666666666666, 'sum': 26, 'max': 15, 'min': 1, 'std': 5.792715732327589}, 'z': {'mean': 6.0, 'sum': 18, 'max': 7, 'min': 5, 'std': 0.816496580927726}}\n >>> f_742([])\n {'x': None, 'y': None, 'z': None}\n >>> f_742([{'a': 1}])\n {'x': None, 'y': None, 'z': None}\n \"\"\"", "canonical_solution": " if not isinstance(d, list) or any(not isinstance(item, dict) for item in d):\n raise ValueError(\"Input must be a list of dictionaries.\")\n \n if not d:\n return {key: None for key in ['x', 'y', 'z']}\n\n df = pd.DataFrame(d).fillna(0) # Replace missing values with 0 to allow computations\n stats = {}\n\n for key in ['x', 'y', 'z']:\n if key in df.columns:\n stats[key] = {\n 'mean': np.mean(df[key]),\n 'sum': 
np.sum(df[key]),\n 'max': np.max(df[key]),\n 'min': np.min(df[key]),\n 'std': np.std(df[key], ddof=0) # Population standard deviation\n }\n else:\n stats[key] = None\n\n return stats", "test": "# Test suite\nimport unittest\nclass TestCases(unittest.TestCase):\n def test_empty_list(self):\n self.assertEqual(f_742([]), {'x': None, 'y': None, 'z': None})\n def test_valid_input(self):\n data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]\n result = f_742(data)\n self.assertAlmostEqual(result['x']['mean'], 2.0)\n self.assertAlmostEqual(result['y']['mean'], 8.666666666666666)\n self.assertAlmostEqual(result['z']['mean'], 6.0)\n def test_invalid_input_type(self):\n with self.assertRaises(ValueError):\n f_742(\"not a list\")\n def test_partial_keys(self):\n data = [{'x': 1, 'y': 2}, {'y': 3, 'z': 4}]\n result = f_742(data)\n self.assertIsNotNone(result['x'])\n self.assertIsNotNone(result['y'])\n self.assertIsNotNone(result['z'])\n def test_all_keys_missing(self):\n data = [{'a': 1}, {'b': 2}]\n self.assertEqual(f_742(data), {'x': None, 'y': None, 'z': None})", "apis": ["numpy.sum", "pandas.DataFrame", "numpy.mean", "numpy.min", "numpy.std", "numpy.max"], "libs": ["pandas", "numpy"], "doc": {"description": ["Calculate mean, sum, max, min and standard deviation for the keys \"x,\" \"y\" and \"z\" from a list of dictionaries \"d.\""], "note": [], "params": ["d (list): A list of dictionaries."], "returns": ["dict: A dictionary with keys as 'x', 'y', and 'z' and values as dictionaries of statistics."], "reqs": ["pandas", "numpy"], "raises": ["ValueError: If input is not a list of dictionaries."], "example": ["Examples:", ">>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}]", ">>> f_742(data)", "{'x': {'mean': 2.0, 'sum': 6, 'max': 3, 'min': 1, 'std': 0.816496580927726}, 'y': {'mean': 8.666666666666666, 'sum': 26, 'max': 15, 'min': 1, 'std': 5.792715732327589}, 'z': {'mean': 6.0, 'sum': 18, 'max': 7, 
'min': 5, 'std': 0.816496580927726}}", ">>> f_742([])", "{'x': None, 'y': None, 'z': None}", ">>> f_742([{'a': 1}])", "{'x': None, 'y': None, 'z': None}"]}} +{"task_id": "f_582", "prompt": "import pandas as pd\nfrom sklearn.cluster import KMeans\n\ndef f_582(x_list, y_list):\n \"\"\"\n Perform K-Means clustering on the given data by first turning it into a DataFrame with two columns \"x\" and \"y\" and then return the labels and centroids.\n\n Parameters:\n - x_list (list): List of data corresponding to 'x'\n - y_list (list): List of data corresponding to 'y'\n\n Returns:\n tuple: The labels and centroids as numpy arrays.\n - kmeans.labels_: A NumPy array where each element is the cluster label assigned to each data point. \n - kmeans.cluster_centers_: A NumPy array containing the coordinates of the cluster centers.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [2, 3, 4, 5, 6, 7]})\n >>> labels, centroids = f_582([1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7])\n \"\"\"", "canonical_solution": " df = pd.DataFrame({'x': x_list, 'y': y_list})\n kmeans = KMeans(n_clusters=2, random_state=0).fit(df)\n return kmeans.labels_, kmeans.cluster_centers_", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n labels, centroids = f_582([1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 1)\n self.assertEqual(labels[4], 1)\n self.assertEqual(labels[5], 1)\n self.assertEqual(centroids[0][0], 2.)\n self.assertEqual(centroids[0][1], 3.)\n self.assertEqual(centroids[1][0], 5.)\n self.assertEqual(centroids[1][1], 6.)\n def test_case_2(self):\n labels, centroids = f_582([1, 1, 1, 1, 1, 1], [2, 2, 2, 2, 2, 2])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 0)\n self.assertEqual(labels[4], 0)\n 
self.assertEqual(labels[5], 0)\n self.assertEqual(centroids[0][0], 1.)\n self.assertEqual(centroids[0][1], 2.)\n def test_case_3(self):\n labels, centroids = f_582([1, 2, 3, 4, 5, 6], [2, 2, 2, 2, 2, 2])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 1)\n self.assertEqual(labels[4], 1)\n self.assertEqual(labels[5], 1)\n self.assertEqual(centroids[0][0], 2.)\n self.assertEqual(centroids[0][1], 2.)\n self.assertEqual(centroids[1][0], 5.)\n self.assertEqual(centroids[1][1], 2.)\n def test_case_4(self):\n labels, centroids = f_582([0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n def test_case_5(self):\n labels, centroids = f_582([1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6])\n self.assertEqual(labels[0], 0)\n self.assertEqual(labels[1], 0)\n self.assertEqual(labels[2], 0)\n self.assertEqual(labels[3], 1)\n self.assertEqual(labels[4], 1)\n self.assertEqual(labels[5], 1)\n self.assertEqual(centroids[0][0], 2.)\n self.assertEqual(centroids[0][1], 2.)\n self.assertEqual(centroids[1][0], 5.)\n self.assertEqual(centroids[1][1], 5.)", "apis": ["pandas.DataFrame", "sklearn.cluster.KMeans"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform K-Means clustering on the given data by first turning it into a DataFrame with two columns \"x\" and \"y\" and then return the labels and centroids."], "note": [], "params": ["x_list (list): List of data corresponding to 'x'", "y_list (list): List of data corresponding to 'y'"], "returns": ["tuple: The labels and centroids as numpy arrays.", "kmeans.labels_: A NumPy array where each element is the cluster label assigned to each data point.", "kmeans.cluster_centers_: A NumPy array containing the coordinates of the cluster centers."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [2, 3, 4, 5, 6, 7]})", ">>> labels, centroids = 
f_582([1, 2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7])"]}} +{"task_id": "f_412", "prompt": "from collections import defaultdict\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_412(data):\n \"\"\"\n Calculate statistical measurements (mean and standard deviation) of the values associated with\n each key in a list of dictionaries and visualize them with bar charts.\n\n Parameters:\n data (list): The list of dictionaries. Must not be empty. Each dictionary must have numeric values.\n The function raises ValueError if the input list is empty and TypeError if the input is not a\n list of dictionaries or contains non-numeric values.\n\n Returns:\n tuple:\n - dict: A dictionary with keys and their corresponding mean and standard deviation.\n - list: A list of matplotlib Axes objects for each key's visualization.\n\n Requirements:\n - numpy\n - matplotlib.pyplot\n - collections.defaultdict\n \n Example:\n >>> stats, axes = f_412([{'cat': 1, 'dog': 3}, {'cat' : 2, 'dog': 5}, {'cat' : 3, 'dog': 7}])\n >>> stats\n {'cat': {'mean': 2.0, 'std': 0.816496580927726}, 'dog': {'mean': 5.0, 'std': 1.632993161855452}}\n >>> axes\n [, ]\n \"\"\"", "canonical_solution": " if not data:\n raise ValueError(\"Input data is empty.\")\n if not isinstance(data, list) or not all(isinstance(d, dict) for d in data):\n raise TypeError(\"Input must be a list of dictionaries.\")\n for d in data:\n if not all(isinstance(value, (int, float)) for value in d.values()):\n raise TypeError(\"All values in the dictionaries must be numeric.\")\n\n stats = defaultdict(list)\n for d in data:\n for key, value in d.items():\n stats[key].append(value)\n\n result = {k: {\"mean\": np.mean(v), \"std\": np.std(v)} for k, v in stats.items()}\n\n # Visualization\n axes = []\n for key in result:\n fig, ax = plt.subplots()\n ax.bar(x=[\"mean\", \"std\"], height=result[key].values())\n ax.set_title(f\"Statistics of {key}\")\n ax.set_ylabel(\"Value\")\n axes.append(ax)\n\n return result, axes", "test": "import 
unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n data = [{\"cat\": 1, \"dog\": 3}, {\"cat\": 2, \"dog\": 5}, {\"cat\": 3, \"dog\": 7}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats,\n {\n \"cat\": {\"mean\": 2.0, \"std\": 0.816496580927726},\n \"dog\": {\"mean\": 5.0, \"std\": 1.632993161855452},\n },\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n self.assertEqual(axes[1].get_title(), \"Statistics of dog\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_2(self):\n # Test other keys (animals)\n data = [{\"bird\": 5, \"fish\": 10}, {\"bird\": 6, \"fish\": 8}, {\"bird\": 7, \"fish\": 9}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats,\n {\n \"bird\": {\"mean\": 6.0, \"std\": 0.816496580927726},\n \"fish\": {\"mean\": 9.0, \"std\": 0.816496580927726},\n },\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of bird\")\n self.assertEqual(axes[1].get_title(), \"Statistics of fish\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_3(self):\n # Test handling negatives\n data = [{\"cat\": -1, \"dog\": -3}, {\"cat\": -2, \"dog\": -5}, {\"cat\": -3, \"dog\": -7}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats,\n {\n \"cat\": {\"mean\": -2.0, \"std\": 0.816496580927726},\n \"dog\": {\"mean\": -5.0, \"std\": 1.632993161855452},\n },\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n self.assertEqual(axes[1].get_title(), \"Statistics of dog\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_4(self):\n # Test single input\n data = [{\"cat\": 1}]\n stats, axes = f_412(data)\n self.assertEqual(stats, {\"cat\": {\"mean\": 
1.0, \"std\": 0.0}})\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_5(self):\n # Test handling zero\n data = [{\"cat\": 0, \"dog\": 0}, {\"cat\": 0, \"dog\": 0}, {\"cat\": 0, \"dog\": 0}]\n stats, axes = f_412(data)\n self.assertEqual(\n stats, {\"cat\": {\"mean\": 0.0, \"std\": 0.0}, \"dog\": {\"mean\": 0.0, \"std\": 0.0}}\n )\n self.assertEqual(axes[0].get_title(), \"Statistics of cat\")\n self.assertEqual(axes[1].get_title(), \"Statistics of dog\")\n for ax, key in zip(axes, stats):\n heights = [rect.get_height() for rect in ax.patches]\n self.assertListEqual(heights, list(stats[key].values()))\n def test_case_6(self):\n # Test correct handling of empty input\n with self.assertRaises(ValueError):\n f_412([])\n def test_case_7(self):\n # Test correct handling of incorrect input types\n with self.assertRaises(TypeError):\n f_412(\"not a list\")\n with self.assertRaises(TypeError):\n f_412([123])\n with self.assertRaises(TypeError):\n f_412([{\"cat\": \"not numeric\"}])\n def test_case_8(self):\n # Test with a mix of positive and negative integers\n data = [\n {\"apple\": -2, \"banana\": 4},\n {\"apple\": -4, \"banana\": 6},\n {\"apple\": -6, \"banana\": 8},\n ]\n stats, _ = f_412(data)\n self.assertEqual(\n stats,\n {\n \"apple\": {\"mean\": -4.0, \"std\": 1.632993161855452},\n \"banana\": {\"mean\": 6.0, \"std\": 1.632993161855452},\n },\n )\n def test_case_9(self):\n # Test with floating point numbers\n data = [{\"x\": 0.5, \"y\": 1.5}, {\"x\": 2.5, \"y\": 3.5}, {\"x\": 4.5, \"y\": 5.5}]\n stats, _ = f_412(data)\n self.assertEqual(\n stats,\n {\n \"x\": {\"mean\": 2.5, \"std\": 1.632993161855452},\n \"y\": {\"mean\": 3.5, \"std\": 1.632993161855452},\n },\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "numpy.std", "collections.defaultdict", 
"numpy.mean"], "libs": ["collections", "numpy", "matplotlib"], "doc": {"description": ["Calculate statistical measurements (mean and standard deviation) of the values associated with", "each key in a list of dictionaries and visualize them with bar charts."], "note": [], "params": ["data (list): The list of dictionaries. Must not be empty. Each dictionary must have numeric values.", "The function raises ValueError if the input list is empty and TypeError if the input is not a", "list of dictionaries or contains non-numeric values."], "returns": ["tuple:", "dict: A dictionary with keys and their corresponding mean and standard deviation.", "list: A list of matplotlib Axes objects for each key's visualization."], "reqs": ["numpy", "matplotlib.pyplot", "collections.defaultdict"], "raises": [], "example": [">>> stats, axes = f_412([{'cat': 1, 'dog': 3}, {'cat' : 2, 'dog': 5}, {'cat' : 3, 'dog': 7}])", ">>> stats", "{'cat': {'mean': 2.0, 'std': 0.816496580927726}, 'dog': {'mean': 5.0, 'std': 1.632993161855452}}", ">>> axes", "[, ]"]}} {"task_id": "f_779", "prompt": "import unittest\nimport string\n\n# Constants\nALPHABET = list(string.ascii_lowercase)\n\ndef f_779(word: str) -> str:\n \"\"\"\n Sort the letters of a given word based on their position in the alphabet.\n \n Parameters:\n word (str): The input word consisting of lowercase alphabetic characters.\n \n Returns:\n str: The word with its letters sorted alphabetically.\n \n Requirements:\n - Utilizes the string library to define the alphabet.\n - Uses a constant ALPHABET list to represent the lowercase alphabet.\n \n Example:\n >>> f_779('cba')\n 'abc'\n >>> f_779('zyx')\n 'xyz'\n \"\"\"", "canonical_solution": " return ''.join(sorted(word))", "test": "class TestF748(unittest.TestCase):\n def test_case_1(self):\n # Testing with a word that's already sorted\n self.assertEqual(f_779('abc'), 'abc')\n def test_case_2(self):\n # Testing with a word that's in reverse order\n self.assertEqual(f_779('zyx'), 'xyz')\n def 
test_case_3(self):\n # Testing with a word that has duplicate letters\n self.assertEqual(f_779('aabbcc'), 'aabbcc')\n def test_case_4(self):\n # Testing with a single-letter word\n self.assertEqual(f_779('a'), 'a')\n def test_case_5(self):\n # Testing with an empty string\n self.assertEqual(f_779(''), '')", "apis": ["string.ascii_lowercase"], "libs": ["string"], "doc": {"description": ["Sort the letters of a given word based on their position in the alphabet."], "note": [], "params": ["word (str): The input word consisting of lowercase alphabetic characters."], "returns": ["str: The word with its letters sorted alphabetically."], "reqs": ["Utilizes the string library to define the alphabet.", "Uses a constant ALPHABET list to represent the lowercase alphabet."], "raises": [], "example": [">>> f_779('cba')", "'abc'", ">>> f_779('zyx')", "'xyz'"]}} -{"task_id": "f_368", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom collections import Counter\n\ndef f_368(student_grades, possible_grades=[\"A\", \"B\", \"C\", \"D\", \"F\"]):\n \"\"\"\n Create a report on students' grades in a class, including a count of each grade out of all possible grades\n and a bar chart. Note: Grades are case-insensitive but whitespace-sensitive. Those not in possible grades\n are ignored.\n\n Parameters:\n student_grades (list): List of student grades. Must not be empty.\n possible_grades (list, optional): List of possible grade values. 
Defaults to ['A', 'B', 'C', 'D', 'F'].\n\n Returns:\n Tuple[DataFrame, Axes]:\n - A pandas DataFrame with 'Grade' as the named index and their 'Count' as values.\n - A bar chart plot (matplotlib's Axes object) visualizing 'Grade Distribution', with 'Grade' on the\n x-axis and 'Number of Students' on the y-axis.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n - collections.Counter\n\n Example:\n >>> student_grades = ['A', 'B', 'B', 'C', 'A', 'D', 'F', 'B', 'A', 'C']\n >>> report_df, ax = f_368(student_grades)\n >>> type(ax)\n \n >>> report_df\n Count\n Grade \n A 3\n B 3\n C 2\n D 1\n F 1\n \"\"\"", "canonical_solution": " if not student_grades:\n raise ValueError(\"student_grades cannot be empty\")\n possible_grades = [*dict.fromkeys([g.upper() for g in possible_grades])]\n grade_counts = dict(Counter([g.upper() for g in student_grades]))\n report_data = {grade: grade_counts.get(grade, 0) for grade in possible_grades}\n report_df = pd.DataFrame.from_dict(report_data, orient=\"index\", columns=[\"Count\"])\n report_df.index.name = \"Grade\"\n\n ax = report_df.plot(kind=\"bar\", legend=False, title=\"Grade Distribution\")\n ax.set_ylabel(\"Number of Students\")\n ax.set_xlabel(\"Grade\")\n\n plt.tight_layout()\n\n return report_df, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def _validate_plot(self, ax):\n self.assertEqual(ax.get_title(), \"Grade Distribution\")\n self.assertEqual(ax.get_xlabel(), \"Grade\")\n self.assertEqual(ax.get_ylabel(), \"Number of Students\")\n def _test_helper(self, grades, expected_counts):\n expected_df = pd.DataFrame(\n {\"Count\": expected_counts}, index=[\"A\", \"B\", \"C\", \"D\", \"F\"]\n )\n expected_df.index.name = \"Grade\"\n report_df, ax = f_368(grades)\n pd.testing.assert_frame_equal(report_df, expected_df)\n self._validate_plot(ax)\n def test_case_1(self):\n # Test with a mix of grades\n self._test_helper(\n [\"A\", \"B\", \"B\", \"C\", \"A\", 
\"D\", \"F\", \"B\", \"A\", \"C\"], [3, 3, 2, 1, 1]\n )\n def test_case_2(self):\n # Test with only one type of grade\n self._test_helper([\"A\", \"A\", \"A\", \"A\", \"A\"], [5, 0, 0, 0, 0])\n def test_case_3(self):\n # Test with an empty list of grades\n with self.assertRaises(Exception):\n f_368([], [0, 0, 0, 0, 0])\n def test_case_4(self):\n # Test correctly ignoring invalid grades\n self._test_helper([\"A\", \"X\", \"Y\", \"Z\"], [1, 0, 0, 0, 0])\n def test_case_5(self):\n # Test custom grades\n grades = [\"A\", \"C\", \"G\", \"G\"]\n expected_counts = [1, 0, 1, 0, 0, 2]\n possible_grades = [\"A\", \"B\", \"C\", \"D\", \"F\", \"G\"]\n expected_df = pd.DataFrame(\n {\"Count\": expected_counts},\n index=[*dict.fromkeys(g.upper() for g in possible_grades)],\n )\n expected_df.index.name = \"Grade\"\n report_df, ax = f_368(grades, possible_grades=possible_grades)\n pd.testing.assert_frame_equal(report_df, expected_df)\n self._validate_plot(ax)\n def test_case_6(self):\n # Test case insensitivity\n self._test_helper([\"a\", \"b\", \"C\"], [1, 1, 1, 0, 0])\n def test_case_7(self):\n # Test whitespace sensitivity\n self._test_helper([\"A \", \"b\", \" C\"], [0, 1, 0, 0, 0])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame.from_dict", "matplotlib.pyplot.tight_layout", "pandas.DataFrame", "collections.Counter"], "libs": ["matplotlib", "pandas", "collections"], "doc": {"description": ["Create a report on students' grades in a class, including a count of each grade out of all possible grades", "and a bar chart. Note: Grades are case-insensitive but whitespace-sensitive. Those not in possible grades", "are ignored."], "note": [], "params": ["student_grades (list): List of student grades. Must not be empty.", "possible_grades (list, optional): List of possible grade values. 
Defaults to ['A', 'B', 'C', 'D', 'F']."], "returns": ["Tuple[DataFrame, Axes]:", "A pandas DataFrame with 'Grade' as the named index and their 'Count' as values.", "A bar chart plot (matplotlib's Axes object) visualizing 'Grade Distribution', with 'Grade' on the", "x-axis and 'Number of Students' on the y-axis."], "reqs": ["pandas", "matplotlib.pyplot", "collections.Counter"], "raises": [], "example": [">>> student_grades = ['A', 'B', 'B', 'C', 'A', 'D', 'F', 'B', 'A', 'C']", ">>> report_df, ax = f_368(student_grades)", ">>> type(ax)", "", ">>> report_df", "Count", "Grade", "A 3", "B 3", "C 2", "D 1", "F 1"]}} -{"task_id": "f_819", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_819(records: np.ndarray, random_seed: int = 0) -> pd.DataFrame:\n \"\"\"\n Randomly shuffle the given array's features, normalize its values, then convert to a DataFrame\n with shuffled feature names.\n\n Parameters:\n - records (np.ndarray): A 2D numpy array with each row as a record and each column as a feature.\n - random_seed (int, optional): Seed for random operations to ensure reproducibility.\n\n Returns:\n - pd.DataFrame: A pandas DataFrame containing the preprocessed data, with shuffled feature names.\n\n Raises:\n - ValueError: If records is not 2D.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Notes:\n - This function normalizes data by subtracting the mean and scaling to unit variance.\n - Feature names are of format f{n}; for example, if the records have 5 features, feature\n names will be [\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"] shuffled.\n\n Examples:\n >>> data = np.array([[1, 2, 3], [4, 5, 6]])\n >>> df = f_819(data, random_seed=42)\n >>> df.shape\n (2, 3)\n >>> df.columns\n Index(['f2', 'f3', 'f1'], dtype='object')\n >>> data = np.array([[-1, -2, -3, -4, -5], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5]])\n >>> df = f_819(data, random_seed=24)\n >>> df\n f3 f1 f4 f5 f2\n 0 -1.224745 -1.224745 -1.224745 -1.224745 
-1.224745\n 1 0.000000 0.000000 0.000000 0.000000 0.000000\n 2 1.224745 1.224745 1.224745 1.224745 1.224745\n \"\"\"", "canonical_solution": " if random_seed is not None:\n np.random.seed(random_seed)\n\n if not (records.ndim == 2):\n raise ValueError(\"Input must be a 2D numpy array.\")\n\n records_copy = records.copy()\n np.random.shuffle(records_copy.T)\n\n scaler = StandardScaler()\n normalized_records = scaler.fit_transform(records_copy)\n\n features = [f\"f{i+1}\" for i in range(records[0].shape[0])]\n np.random.shuffle(features)\n\n df = pd.DataFrame(normalized_records, columns=features)\n\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.data = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.expected_shape = (2, 5)\n def test_case_1(self):\n # Test basic shape and columns\n df = f_819(self.data, random_seed=1)\n self.assertEqual(df.shape, self.expected_shape)\n self.assertTrue(set(df.columns) == set([\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"]))\n # assert last row values\n self.assertEqual(df.iloc[-1].tolist(), [1.0, 1.0, 1.0, 1.0, 1.0])\n self.assertEqual(df.iloc[0].tolist(), [-1.0, -1.0, -1.0, -1.0, -1.0])\n \n def test_case_2(self):\n # Test normalization\n df = f_819(self.data, random_seed=2)\n np.testing.assert_array_almost_equal(\n df.mean(axis=0), np.zeros(self.expected_shape[1]), decimal=5\n )\n np.testing.assert_array_almost_equal(\n df.std(axis=0, ddof=0), np.ones(self.expected_shape[1]), decimal=5\n )\n \n def test_case_3(self):\n # Test random seed effect\n df1 = f_819(self.data, random_seed=3)\n df2 = f_819(self.data, random_seed=3)\n pd.testing.assert_frame_equal(df1, df2)\n def test_case_4(self):\n # Test handling invalid inputs\n with self.assertRaises(ValueError):\n f_819(np.array([1, 2, 3]), random_seed=4)\n with self.assertRaises(ValueError):\n f_819(np.array([[1, 2, 3], [4, 5]], dtype=object), random_seed=4)\n def test_case_5(self):\n # Test handling zero 
variance\n data = np.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]])\n df = f_819(data, random_seed=42)\n # In cases of zero variance, StandardScaler will set values to 0\n np.testing.assert_array_equal(df.values, np.zeros(data.shape))", "apis": ["numpy.ndarray", "numpy.random", "numpy.random.seed", "numpy.random.shuffle", "pandas.DataFrame", "sklearn.preprocessing.StandardScaler"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Randomly shuffle the given array's features, normalize its values, then convert to a DataFrame", "with shuffled feature names.", "Notes:", "- This function normalizes data by subtracting the mean and scaling to unit variance.", "- Feature names are of format f{n}; for example, if the records have 5 features, feature", "names will be [\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"] shuffled."], "note": [], "params": ["records (np.ndarray): A 2D numpy array with each row as a record and each column as a feature.", "random_seed (int, optional): Seed for random operations to ensure reproducibility."], "returns": ["pd.DataFrame: A pandas DataFrame containing the preprocessed data, with shuffled feature names."], "reqs": ["numpy", "pandas", "sklearn"], "raises": ["ValueError: If records is not 2D."], "example": ["Examples:", ">>> data = np.array([[1, 2, 3], [4, 5, 6]])", ">>> df = f_819(data, random_seed=42)", ">>> df.shape", "(2, 3)", ">>> df.columns", "Index(['f2', 'f3', 'f1'], dtype='object')", ">>> data = np.array([[-1, -2, -3, -4, -5], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5]])", ">>> df = f_819(data, random_seed=24)", ">>> df", "f3 f1 f4 f5 f2", "0 -1.224745 -1.224745 -1.224745 -1.224745 -1.224745", "1 0.000000 0.000000 0.000000 0.000000 0.000000", "2 1.224745 1.224745 1.224745 1.224745 1.224745"]}} -{"task_id": "f_751", "prompt": "import pandas as pd\nimport itertools\nfrom random import shuffle\n\ndef f_751(letters=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'], categories=['Category 1', 'Category 2', 'Category 3']):\n \"\"\"\n Create a Pandas 
DataFrame by associating each element from a list of letters to a category from a list of categories.\n The categories are randomly shuffled.\n\n Parameters:\n letters (List[str]): A list of letters to be included in the DataFrame. Default is ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'].\n categories (List[str]): A list of categories to be included in the DataFrame. Default is ['Category 1', 'Category 2', 'Category 3'].\n\n Returns:\n DataFrame: A Pandas DataFrame with two columns: 'Letter' and 'Category'. Each letter is randomly associated with a category.\n\n Requirements:\n - pandas\n - itertools\n - random.shuffle\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> df = f_751(['A', 'B'], ['Cat 1', 'Cat 2'])\n >>> print(df)\n Letter Category\n 0 A Cat 2\n 1 B Cat 1\n 2 A Cat 1\n 3 B Cat 2\n >>> random.seed(1)\n >>> df = f_751()\n >>> print(df.head())\n Letter Category\n 0 A Category 3\n 1 B Category 3\n 2 C Category 2\n 3 D Category 2\n 4 E Category 3\n \"\"\"", "canonical_solution": " \n flattened_list = list(itertools.chain(*[letters for _ in range(len(categories))]))\n expanded_categories = list(itertools.chain(*[[category] * len(letters) for category in categories]))\n shuffle(expanded_categories)\n\n df = pd.DataFrame({'Letter': flattened_list, 'Category': expanded_categories})\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing with default parameters\n df = f_751()\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 27) # 9 letters * 3 categories\n def test_case_2(self):\n # Testing with custom parameters\n df = f_751(['X', 'Y'], ['Cat 1'])\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 2) # 2 letters * 1 category\n def test_case_3(self):\n # Testing with empty categories list\n df = f_751(['X', 'Y'], [])\n 
self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 0) # 2 letters * 0 categories\n def test_case_4(self):\n # Testing with empty letters list\n df = f_751([], ['Cat 1', 'Cat 2'])\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 0) # 0 letters * 2 categories\n def test_case_5(self):\n # Testing with both empty lists\n df = f_751([], [])\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 0) # 0 letters * 0 categories", "apis": ["random.shuffle", "pandas.DataFrame", "itertools.chain"], "libs": ["pandas", "random", "itertools"], "doc": {"description": ["Create a Pandas DataFrame by associating each element from a list of letters to a category from a list of categories.", "The categories are randomly shuffled."], "note": [], "params": ["letters (List[str]): A list of letters to be included in the DataFrame. Default is ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'].", "categories (List[str]): A list of categories to be included in the DataFrame. Default is ['Category 1', 'Category 2', 'Category 3']."], "returns": ["DataFrame: A Pandas DataFrame with two columns: 'Letter' and 'Category'. 
Each letter is randomly associated with a category."], "reqs": ["pandas", "itertools", "random.shuffle"], "raises": [], "example": [">>> import random", ">>> random.seed(0)", ">>> df = f_751(['A', 'B'], ['Cat 1', 'Cat 2'])", ">>> print(df)", "Letter Category", "0 A Cat 2", "1 B Cat 1", "2 A Cat 1", "3 B Cat 2", ">>> random.seed(1)", ">>> df = f_751()", ">>> print(df.head())", "Letter Category", "0 A Category 3", "1 B Category 3", "2 C Category 2", "3 D Category 2", "4 E Category 3"]}} -{"task_id": "f_771", "prompt": "import numpy as np\nfrom scipy import stats\ndef f_771(word: str) -> np.ndarray:\n \"\"\"\n Calculate the difference between the ASCII values of each pair of adjacent letters in the input word.\n After calculating the difference, calculate the entropy of the differences.\n \n Requirements:\n - numpy\n - scipy.stats\n \n Parameters:\n - word (str): The input word as a string.\n \n Returns:\n - np.ndarray: A numpy array containing the difference between the ASCII values of each pair of adjacent letters in the word.\n - float: The entropy of the differences.\n \n Examples:\n >>> f_771('abcdef')\n (array([1, 1, 1, 1, 1]), 1.6094379124341005)\n >>> f_771('hello')\n (array([-3, 7, 0, 3]), -inf)\n \"\"\"", "canonical_solution": " if not word: # Handling the case for empty string\n return np.array([])\n word_ascii_values = np.array([ord(x) for x in word])\n difference = np.diff(word_ascii_values)\n entropy = stats.entropy(difference)\n \n return difference, entropy", "test": "import unittest\nclass TestF_771(unittest.TestCase):\n def test_case_1(self):\n result = f_771('abcdef')\n expected_diff = np.array([1, 1, 1, 1, 1])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 1.6094379124341005)\n \n def test_case_2(self):\n result = f_771('hell')\n expected_diff = np.array([-3, 7, 0])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], -np.inf)\n \n def test_case_3(self):\n result = 
f_771('az')\n expected_diff = np.array([25])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 0.0)\n \n def test_case_4(self):\n result = f_771('a')\n expected_diff = np.array([])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 0.0)\n \n def test_case_5(self):\n result = f_771('i love Python')\n expected_diff = np.array([-73, 76, 3, 7, -17, -69, 48, 41, -5, -12, 7, -1])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], -np.inf)\n \n def test_case_6(self):\n result = f_771('Za')\n expected_diff = np.array([7])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 0.0)\n def test_case_7(self):\n result = f_771('racecar')\n expected_diff = np.array([-17, 2, 2, -2, -2, 17])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], -np.inf)", "apis": ["scipy.stats.entropy", "numpy.array", "numpy.diff", "numpy.ndarray"], "libs": ["numpy", "scipy"], "doc": {"description": ["Calculate the difference between the ASCII values of each pair of adjacent letters in the input word.", "After calculating the difference, calculate the entropy of the differences."], "note": [], "params": ["word (str): The input word as a string."], "returns": ["np.ndarray: A numpy array containing the difference between the ASCII values of each pair of adjacent letters in the word.", "float: The entropy of the differences."], "reqs": ["numpy", "scipy.stats"], "raises": [], "example": ["Examples:", ">>> f_771('abcdef')", "(array([1, 1, 1, 1, 1]), 1.6094379124341005)", ">>> f_771('hello')", "(array([-3, 7, 0, 3]), -inf)"]}} -{"task_id": "f_374", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.optimize import curve_fit\n\n\ndef f_374(X, Y):\n \"\"\"\n Adjust a quadratic function to the given data (X, Y) and plot the data along with the fit.\n\n Parameters:\n - X (list or np.array): The X data 
points.\n - Y (list or np.array): The Y data points.\n\n Returns:\n tuple:\n - list: The optimized parameters of the quadratic function (a, b, c).\n - matplotlib.axes.Axes: The plot showing the data points and the quadratic fit.\n\n Requirements:\n - matplotlib.pyplot\n - scipy.optimize.curve_fit\n\n Example:\n >>> np.random.seed(42)\n >>> X = np.linspace(-10, 10, 100)\n >>> Y = 3*X**2 + 2*X + 1 + np.random.normal(0, 20, len(X))\n >>> params, ax = f_374(X, Y)\n >>> params\n [3.0366511660907975, 2.1379326607136035, -2.3233168384548284]\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n def func(x, a, b, c):\n return a * x ** 2 + b * x + c\n\n popt, pcov = curve_fit(func, X, Y)\n\n fig, ax = plt.subplots()\n ax.scatter(X, Y)\n ax.plot(X, func(X, *popt), \"r-\")\n\n return list(popt), ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport itertools\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.random_seed = 42\n np.random.seed(self.random_seed)\n self.test_data = [\n (\n np.linspace(-10, 10, 100),\n 3 * np.linspace(-10, 10, 100) ** 2\n + 2 * np.linspace(-10, 10, 100)\n + 1\n + np.random.normal(0, 20, 100),\n ),\n (\n np.linspace(-5, 5, 100),\n -2 * np.linspace(-5, 5, 100) ** 2\n + 4 * np.linspace(-5, 5, 100)\n - 3\n + np.random.normal(0, 10, 100),\n ),\n (\n np.linspace(-100, 100, 100),\n 0.5 * np.linspace(-100, 100, 100) ** 2\n + 1 * np.linspace(-100, 100, 100)\n + 10\n + np.random.normal(0, 50, 100),\n ),\n (\n np.linspace(-1, 1, 100),\n 10 * np.linspace(-1, 1, 100) ** 2\n + 5 * np.linspace(-1, 1, 100)\n + 2\n + np.random.normal(0, 1, 100),\n ),\n ]\n def assertDataInPlot(self, X, Y, ax):\n xdata, ydata = ax.collections[0].get_offsets().T # Access scatter plot data\n self.assertTrue(np.array_equal(X, xdata))\n self.assertTrue(np.array_equal(Y, ydata))\n def test_case_1(self):\n # Test fitting a basic quadratic function with expected params near 3, 2.\n X, Y = self.test_data[0]\n params, ax = f_374(X, Y)\n 
self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], 3, places=0)\n self.assertAlmostEqual(params[1], 2, places=0)\n def test_case_2(self):\n # Test fitting a basic quadratic function with expected params near -2, 4.\n X, Y = self.test_data[1]\n params, ax = f_374(X, Y)\n self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], -2, places=0)\n self.assertAlmostEqual(params[1], 4, places=0)\n def test_case_3(self):\n # Test fitting a wide parabola with parameters (0.5, 1).\n X, Y = self.test_data[2]\n params, ax = f_374(X, Y)\n self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], 0.5, places=0)\n self.assertAlmostEqual(params[1], 1, places=0)\n def test_case_4(self):\n # Test fitting a steep parabola with high coefficients (10, 5).\n X, Y = self.test_data[3]\n params, ax = f_374(X, Y)\n self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], 10, places=0)\n self.assertAlmostEqual(params[1], 5, places=0)\n def test_case_5(self):\n # Test handling non-numeric data - convertable to int\n string_int_list = [\"1\", \"2\", \"3\"]\n int_list = [1, 2, 3]\n with self.assertRaises(TypeError):\n f_374(string_int_list, int_list)\n with self.assertRaises(TypeError):\n f_374(int_list, string_int_list)\n def test_case_6(self):\n # Test handling non-numeric data\n for X, Y in itertools.product([[\"a\", \"b\", \"c\"], [], np.array([])], repeat=2):\n with self.assertRaises(ValueError):\n f_374(X, Y)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["scipy.optimize.curve_fit", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "scipy"], "doc": {"description": ["Adjust a quadratic function to the given data (X, 
Y) and plot the data along with the fit."], "note": [], "params": ["X (list or np.array): The X data points.", "Y (list or np.array): The Y data points."], "returns": ["tuple:", "list: The optimized parameters of the quadratic function (a, b, c).", "matplotlib.axes.Axes: The plot showing the data points and the quadratic fit."], "reqs": ["matplotlib.pyplot", "scipy.optimize.curve_fit"], "raises": [], "example": [">>> np.random.seed(42)", ">>> X = np.linspace(-10, 10, 100)", ">>> Y = 3*X**2 + 2*X + 1 + np.random.normal(0, 20, len(X))", ">>> params, ax = f_374(X, Y)", ">>> params", "[3.0366511660907975, 2.1379326607136035, -2.3233168384548284]", ">>> type(ax)", ""]}} -{"task_id": "f_823", "prompt": "import pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_823(df):\n \"\"\"\n Plots the correlation matrix from numeric columns in a DataFrame and returns a DataFrame\n where the numeric columns are standardized to have mean 0 and variance 1.\n\n Parameters:\n df (pandas.DataFrame): Input DataFrame with columns of numeric data.\n\n Returns:\n pandas.DataFrame: Standardized DataFrame.\n matplotlib.figure.Figure: Figure object containing the heatmap of the correlation matrix.\n\n Requirements:\n - pandas\n - numpy\n - seaborn\n - matplotlib\n - sklearn\n\n Raises:\n - ValueError: If the DataFrame is empty or if no numeric columns are present.\n\n Notes:\n - Only numeric columns are considered for the heatmap. 
Non-numeric columns are ignored.\n\n Examples:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n >>> standardized_df, fig = f_823(df)\n >>> standardized_df\n A B\n 0 -1.224745 -1.224745\n 1 0.000000 0.000000\n 2 1.224745 1.224745\n >>> type(fig)\n \n \"\"\"", "canonical_solution": " numeric_df = df.select_dtypes(include=[np.number])\n if numeric_df.empty:\n raise ValueError(\"No numeric columns present\")\n\n correlation = numeric_df.corr()\n fig, ax = plt.subplots()\n sns.heatmap(correlation, ax=ax)\n\n numeric_cols = numeric_df.columns\n scaler = StandardScaler()\n df[numeric_cols] = scaler.fit_transform(df[numeric_cols])\n\n return df, fig", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case with integer values\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_2(self):\n # Test case with float values\n df = pd.DataFrame({\"X\": [1.1, 2.2, 3.3], \"Y\": [4.4, 5.5, 6.6]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_3(self):\n # Test case with negative values\n df = pd.DataFrame({\"A\": [-1, -2, -3], \"B\": [-4, -5, -6]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_4(self):\n # Test case with single column\n df = pd.DataFrame({\"A\": [1, 2, 3]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n 
self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_5(self):\n # Test proper exception handling - no numeric columns\n df = pd.DataFrame({\"A\": [\"apple\", \"banana\", \"cherry\"]})\n with self.assertRaises(ValueError):\n f_823(df)\n def test_case_6(self):\n # Test proper exception handling - empty dataframe\n df = pd.DataFrame()\n with self.assertRaises(ValueError):\n f_823(df)\n def test_case_7(self):\n # Test ignoring non-numeric columns\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [\"x\", \"y\", \"z\"], \"C\": [4.5, 5.5, 6.5]})\n standardized_df, fig = f_823(df)\n self.assertTrue(\"B\" in standardized_df.columns)\n self.assertTrue(np.allclose(standardized_df[[\"A\", \"C\"]].mean(), 0))\n self.assertTrue(np.allclose(standardized_df[[\"A\", \"C\"]].std(ddof=0), 1))\n self.assertIsInstance(fig, plt.Figure)", "apis": ["seaborn.heatmap", "sklearn.preprocessing.StandardScaler", "matplotlib.pyplot.subplots", "numpy.number"], "libs": ["matplotlib", "numpy", "sklearn", "seaborn"], "doc": {"description": ["Plots the correlation matrix from numeric columns in a DataFrame and returns a DataFrame", "where the numeric columns are standardized to have mean 0 and variance 1.", "Notes:", "- Only numeric columns are considered for the heatmap. 
Non-numeric columns are ignored."], "note": [], "params": ["df (pandas.DataFrame): Input DataFrame with columns of numeric data."], "returns": ["pandas.DataFrame: Standardized DataFrame.", "matplotlib.figure.Figure: Figure object containing the heatmap of the correlation matrix."], "reqs": ["pandas", "numpy", "seaborn", "matplotlib", "sklearn"], "raises": ["ValueError: If the DataFrame is empty or if no numeric columns are present."], "example": ["Examples:", ">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})", ">>> standardized_df, fig = f_823(df)", ">>> standardized_df", "A B", "0 -1.224745 -1.224745", "1 0.000000 0.000000", "2 1.224745 1.224745", ">>> type(fig)", ""]}} +{"task_id": "f_368", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom collections import Counter\n\ndef f_368(student_grades, possible_grades=[\"A\", \"B\", \"C\", \"D\", \"F\"]):\n \"\"\"\n Create a report on students' grades in a class, including a count of each grade out of all possible grades\n and a bar chart. Note: Grades are case-insensitive but whitespace-sensitive. Those not in possible grades\n are ignored.\n\n Parameters:\n student_grades (list): List of student grades. Must not be empty.\n possible_grades (list, optional): List of possible grade values. 
Defaults to ['A', 'B', 'C', 'D', 'F'].\n\n Returns:\n Tuple[DataFrame, Axes]:\n - A pandas DataFrame with 'Grade' as the named index and their 'Count' as values.\n - A bar chart plot (matplotlib's Axes object) visualizing 'Grade Distribution', with 'Grade' on the\n x-axis and 'Number of Students' on the y-axis.\n\n Requirements:\n - pandas\n - matplotlib.pyplot\n - collections.Counter\n\n Example:\n >>> student_grades = ['A', 'B', 'B', 'C', 'A', 'D', 'F', 'B', 'A', 'C']\n >>> report_df, ax = f_368(student_grades)\n >>> type(ax)\n \n >>> report_df\n Count\n Grade \n A 3\n B 3\n C 2\n D 1\n F 1\n \"\"\"", "canonical_solution": " if not student_grades:\n raise ValueError(\"student_grades cannot be empty\")\n possible_grades = [*dict.fromkeys([g.upper() for g in possible_grades])]\n grade_counts = dict(Counter([g.upper() for g in student_grades]))\n report_data = {grade: grade_counts.get(grade, 0) for grade in possible_grades}\n report_df = pd.DataFrame.from_dict(report_data, orient=\"index\", columns=[\"Count\"])\n report_df.index.name = \"Grade\"\n\n ax = report_df.plot(kind=\"bar\", legend=False, title=\"Grade Distribution\")\n ax.set_ylabel(\"Number of Students\")\n ax.set_xlabel(\"Grade\")\n\n plt.tight_layout()\n\n return report_df, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def _validate_plot(self, ax):\n self.assertEqual(ax.get_title(), \"Grade Distribution\")\n self.assertEqual(ax.get_xlabel(), \"Grade\")\n self.assertEqual(ax.get_ylabel(), \"Number of Students\")\n def _test_helper(self, grades, expected_counts):\n expected_df = pd.DataFrame(\n {\"Count\": expected_counts}, index=[\"A\", \"B\", \"C\", \"D\", \"F\"]\n )\n expected_df.index.name = \"Grade\"\n report_df, ax = f_368(grades)\n pd.testing.assert_frame_equal(report_df, expected_df)\n self._validate_plot(ax)\n def test_case_1(self):\n # Test with a mix of grades\n self._test_helper(\n [\"A\", \"B\", \"B\", \"C\", \"A\", 
\"D\", \"F\", \"B\", \"A\", \"C\"], [3, 3, 2, 1, 1]\n )\n def test_case_2(self):\n # Test with only one type of grade\n self._test_helper([\"A\", \"A\", \"A\", \"A\", \"A\"], [5, 0, 0, 0, 0])\n def test_case_3(self):\n # Test with an empty list of grades\n with self.assertRaises(Exception):\n f_368([], [0, 0, 0, 0, 0])\n def test_case_4(self):\n # Test correctly ignoring invalid grades\n self._test_helper([\"A\", \"X\", \"Y\", \"Z\"], [1, 0, 0, 0, 0])\n def test_case_5(self):\n # Test custom grades\n grades = [\"A\", \"C\", \"G\", \"G\"]\n expected_counts = [1, 0, 1, 0, 0, 2]\n possible_grades = [\"A\", \"B\", \"C\", \"D\", \"F\", \"G\"]\n expected_df = pd.DataFrame(\n {\"Count\": expected_counts},\n index=[*dict.fromkeys(g.upper() for g in possible_grades)],\n )\n expected_df.index.name = \"Grade\"\n report_df, ax = f_368(grades, possible_grades=possible_grades)\n pd.testing.assert_frame_equal(report_df, expected_df)\n self._validate_plot(ax)\n def test_case_6(self):\n # Test case insensitivity\n self._test_helper([\"a\", \"b\", \"C\"], [1, 1, 1, 0, 0])\n def test_case_7(self):\n # Test whitespace sensitivity\n self._test_helper([\"A \", \"b\", \" C\"], [0, 1, 0, 0, 0])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "matplotlib.pyplot.tight_layout", "collections.Counter", "pandas.DataFrame.from_dict"], "libs": ["collections", "pandas", "matplotlib"], "doc": {"description": ["Create a report on students' grades in a class, including a count of each grade out of all possible grades", "and a bar chart. Note: Grades are case-insensitive but whitespace-sensitive. Those not in possible grades", "are ignored."], "note": [], "params": ["student_grades (list): List of student grades. Must not be empty.", "possible_grades (list, optional): List of possible grade values. 
Defaults to ['A', 'B', 'C', 'D', 'F']."], "returns": ["Tuple[DataFrame, Axes]:", "A pandas DataFrame with 'Grade' as the named index and their 'Count' as values.", "A bar chart plot (matplotlib's Axes object) visualizing 'Grade Distribution', with 'Grade' on the", "x-axis and 'Number of Students' on the y-axis."], "reqs": ["pandas", "matplotlib.pyplot", "collections.Counter"], "raises": [], "example": [">>> student_grades = ['A', 'B', 'B', 'C', 'A', 'D', 'F', 'B', 'A', 'C']", ">>> report_df, ax = f_368(student_grades)", ">>> type(ax)", "", ">>> report_df", "Count", "Grade", "A 3", "B 3", "C 2", "D 1", "F 1"]}} +{"task_id": "f_819", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_819(records: np.ndarray, random_seed: int = 0) -> pd.DataFrame:\n \"\"\"\n Randomly shuffle the given array's features, normalize its values, then convert to a DataFrame\n with shuffled feature names.\n\n Parameters:\n - records (np.ndarray): A 2D numpy array with each row as a record and each column as a feature.\n - random_seed (int, optional): Seed for random operations to ensure reproducibility.\n\n Returns:\n - pd.DataFrame: A pandas DataFrame containing the preprocessed data, with shuffled feature names.\n\n Raises:\n - ValueError: If records is not 2D.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Notes:\n - This function normalizes data by subtracting the mean and scaling to unit variance.\n - Feature names are of format f{n}; for example, if the records have 5 features, feature\n names will be [\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"] shuffled.\n\n Examples:\n >>> data = np.array([[1, 2, 3], [4, 5, 6]])\n >>> df = f_819(data, random_seed=42)\n >>> df.shape\n (2, 3)\n >>> df.columns\n Index(['f2', 'f3', 'f1'], dtype='object')\n >>> data = np.array([[-1, -2, -3, -4, -5], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5]])\n >>> df = f_819(data, random_seed=24)\n >>> df\n f3 f1 f4 f5 f2\n 0 -1.224745 -1.224745 -1.224745 -1.224745 
-1.224745\n 1 0.000000 0.000000 0.000000 0.000000 0.000000\n 2 1.224745 1.224745 1.224745 1.224745 1.224745\n \"\"\"", "canonical_solution": " if random_seed is not None:\n np.random.seed(random_seed)\n\n if not (records.ndim == 2):\n raise ValueError(\"Input must be a 2D numpy array.\")\n\n records_copy = records.copy()\n np.random.shuffle(records_copy.T)\n\n scaler = StandardScaler()\n normalized_records = scaler.fit_transform(records_copy)\n\n features = [f\"f{i+1}\" for i in range(records[0].shape[0])]\n np.random.shuffle(features)\n\n df = pd.DataFrame(normalized_records, columns=features)\n\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.data = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.expected_shape = (2, 5)\n def test_case_1(self):\n # Test basic shape and columns\n df = f_819(self.data, random_seed=1)\n self.assertEqual(df.shape, self.expected_shape)\n self.assertTrue(set(df.columns) == set([\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"]))\n # assert last row values\n self.assertEqual(df.iloc[-1].tolist(), [1.0, 1.0, 1.0, 1.0, 1.0])\n self.assertEqual(df.iloc[0].tolist(), [-1.0, -1.0, -1.0, -1.0, -1.0])\n \n def test_case_2(self):\n # Test normalization\n df = f_819(self.data, random_seed=2)\n np.testing.assert_array_almost_equal(\n df.mean(axis=0), np.zeros(self.expected_shape[1]), decimal=5\n )\n np.testing.assert_array_almost_equal(\n df.std(axis=0, ddof=0), np.ones(self.expected_shape[1]), decimal=5\n )\n \n def test_case_3(self):\n # Test random seed effect\n df1 = f_819(self.data, random_seed=3)\n df2 = f_819(self.data, random_seed=3)\n pd.testing.assert_frame_equal(df1, df2)\n def test_case_4(self):\n # Test handling invalid inputs\n with self.assertRaises(ValueError):\n f_819(np.array([1, 2, 3]), random_seed=4)\n with self.assertRaises(ValueError):\n f_819(np.array([[1, 2, 3], [4, 5]], dtype=object), random_seed=4)\n def test_case_5(self):\n # Test handling zero 
variance\n data = np.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]])\n df = f_819(data, random_seed=42)\n # In cases of zero variance, StandardScaler will set values to 0\n np.testing.assert_array_equal(df.values, np.zeros(data.shape))", "apis": ["numpy.random.shuffle", "pandas.DataFrame", "numpy.random", "sklearn.preprocessing.StandardScaler", "numpy.ndarray", "numpy.random.seed"], "libs": ["pandas", "numpy", "sklearn"], "doc": {"description": ["Randomly shuffle the given array's features, normalize its values, then convert to a DataFrame", "with shuffled feature names.", "Notes:", "- This function normalizes data by subtracting the mean and scaling to unit variance.", "- Feature names are of format f{n}; for example, if the records have 5 features, feature", "names will be [\"f1\", \"f2\", \"f3\", \"f4\", \"f5\"] shuffled."], "note": [], "params": ["records (np.ndarray): A 2D numpy array with each row as a record and each column as a feature.", "random_seed (int, optional): Seed for random operations to ensure reproducibility."], "returns": ["pd.DataFrame: A pandas DataFrame containing the preprocessed data, with shuffled feature names."], "reqs": ["numpy", "pandas", "sklearn"], "raises": ["ValueError: If records is not 2D."], "example": ["Examples:", ">>> data = np.array([[1, 2, 3], [4, 5, 6]])", ">>> df = f_819(data, random_seed=42)", ">>> df.shape", "(2, 3)", ">>> df.columns", "Index(['f2', 'f3', 'f1'], dtype='object')", ">>> data = np.array([[-1, -2, -3, -4, -5], [0, 0, 0, 0, 0], [1, 2, 3, 4, 5]])", ">>> df = f_819(data, random_seed=24)", ">>> df", "f3 f1 f4 f5 f2", "0 -1.224745 -1.224745 -1.224745 -1.224745 -1.224745", "1 0.000000 0.000000 0.000000 0.000000 0.000000", "2 1.224745 1.224745 1.224745 1.224745 1.224745"]}} +{"task_id": "f_751", "prompt": "import pandas as pd\nimport itertools\nfrom random import shuffle\n\ndef f_751(letters=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'], categories=['Category 1', 'Category 2', 'Category 3']):\n \"\"\"\n Create a Pandas 
DataFrame by associating each element from a list of letters to a category from a list of categories.\n The categories are randomly shuffled.\n\n Parameters:\n letters (List[str]): A list of letters to be included in the DataFrame. Default is ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'].\n categories (List[str]): A list of categories to be included in the DataFrame. Default is ['Category 1', 'Category 2', 'Category 3'].\n\n Returns:\n DataFrame: A Pandas DataFrame with two columns: 'Letter' and 'Category'. Each letter is randomly associated with a category.\n\n Requirements:\n - pandas\n - itertools\n - random.shuffle\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> df = f_751(['A', 'B'], ['Cat 1', 'Cat 2'])\n >>> print(df)\n Letter Category\n 0 A Cat 2\n 1 B Cat 1\n 2 A Cat 1\n 3 B Cat 2\n >>> random.seed(1)\n >>> df = f_751()\n >>> print(df.head())\n Letter Category\n 0 A Category 3\n 1 B Category 3\n 2 C Category 2\n 3 D Category 2\n 4 E Category 3\n \"\"\"", "canonical_solution": " \n flattened_list = list(itertools.chain(*[letters for _ in range(len(categories))]))\n expanded_categories = list(itertools.chain(*[[category] * len(letters) for category in categories]))\n shuffle(expanded_categories)\n\n df = pd.DataFrame({'Letter': flattened_list, 'Category': expanded_categories})\n\n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing with default parameters\n df = f_751()\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 27) # 9 letters * 3 categories\n def test_case_2(self):\n # Testing with custom parameters\n df = f_751(['X', 'Y'], ['Cat 1'])\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 2) # 2 letters * 1 category\n def test_case_3(self):\n # Testing with empty categories list\n df = f_751(['X', 'Y'], [])\n 
self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 0) # 2 letters * 0 categories\n def test_case_4(self):\n # Testing with empty letters list\n df = f_751([], ['Cat 1', 'Cat 2'])\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 0) # 0 letters * 2 categories\n def test_case_5(self):\n # Testing with both empty lists\n df = f_751([], [])\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(set(df.columns), {'Letter', 'Category'})\n self.assertEqual(len(df), 0) # 0 letters * 0 categories", "apis": ["pandas.DataFrame", "itertools.chain", "random.shuffle"], "libs": ["random", "itertools", "pandas"], "doc": {"description": ["Create a Pandas DataFrame by associating each element from a list of letters to a category from a list of categories.", "The categories are randomly shuffled."], "note": [], "params": ["letters (List[str]): A list of letters to be included in the DataFrame. Default is ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'].", "categories (List[str]): A list of categories to be included in the DataFrame. Default is ['Category 1', 'Category 2', 'Category 3']."], "returns": ["DataFrame: A Pandas DataFrame with two columns: 'Letter' and 'Category'. 
Each letter is randomly associated with a category."], "reqs": ["pandas", "itertools", "random.shuffle"], "raises": [], "example": [">>> import random", ">>> random.seed(0)", ">>> df = f_751(['A', 'B'], ['Cat 1', 'Cat 2'])", ">>> print(df)", "Letter Category", "0 A Cat 2", "1 B Cat 1", "2 A Cat 1", "3 B Cat 2", ">>> random.seed(1)", ">>> df = f_751()", ">>> print(df.head())", "Letter Category", "0 A Category 3", "1 B Category 3", "2 C Category 2", "3 D Category 2", "4 E Category 3"]}} +{"task_id": "f_771", "prompt": "import numpy as np\nfrom scipy import stats\ndef f_771(word: str) -> np.ndarray:\n \"\"\"\n Calculate the difference between the ASCII values of each pair of adjacent letters in the input word.\n After calculating the difference, calculate the entropy of the differences.\n \n Requirements:\n - numpy\n - scipy.stats\n \n Parameters:\n - word (str): The input word as a string.\n \n Returns:\n - np.ndarray: A numpy array containing the difference between the ASCII values of each pair of adjacent letters in the word.\n - float: The entropy of the differences.\n \n Examples:\n >>> f_771('abcdef')\n (array([1, 1, 1, 1, 1]), 1.6094379124341005)\n >>> f_771('hello')\n (array([-3, 7, 0, 3]), -inf)\n \"\"\"", "canonical_solution": " if not word: # Handling the case for empty string\n return np.array([])\n word_ascii_values = np.array([ord(x) for x in word])\n difference = np.diff(word_ascii_values)\n entropy = stats.entropy(difference)\n \n return difference, entropy", "test": "import unittest\nclass TestF_771(unittest.TestCase):\n def test_case_1(self):\n result = f_771('abcdef')\n expected_diff = np.array([1, 1, 1, 1, 1])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 1.6094379124341005)\n \n def test_case_2(self):\n result = f_771('hell')\n expected_diff = np.array([-3, 7, 0])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], -np.inf)\n \n def test_case_3(self):\n result = 
f_771('az')\n expected_diff = np.array([25])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 0.0)\n \n def test_case_4(self):\n result = f_771('a')\n expected_diff = np.array([])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 0.0)\n \n def test_case_5(self):\n result = f_771('i love Python')\n expected_diff = np.array([-73, 76, 3, 7, -17, -69, 48, 41, -5, -12, 7, -1])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], -np.inf)\n \n def test_case_6(self):\n result = f_771('Za')\n expected_diff = np.array([7])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], 0.0)\n def test_case_7(self):\n result = f_771('racecar')\n expected_diff = np.array([-17, 2, 2, -2, -2, 17])\n np.testing.assert_array_equal(result[0], expected_diff)\n self.assertEqual(result[1], -np.inf)", "apis": ["scipy.stats.entropy", "numpy.ndarray", "numpy.diff", "numpy.array"], "libs": ["numpy", "scipy"], "doc": {"description": ["Calculate the difference between the ASCII values of each pair of adjacent letters in the input word.", "After calculating the difference, calculate the entropy of the differences."], "note": [], "params": ["word (str): The input word as a string."], "returns": ["np.ndarray: A numpy array containing the difference between the ASCII values of each pair of adjacent letters in the word.", "float: The entropy of the differences."], "reqs": ["numpy", "scipy.stats"], "raises": [], "example": ["Examples:", ">>> f_771('abcdef')", "(array([1, 1, 1, 1, 1]), 1.6094379124341005)", ">>> f_771('hello')", "(array([-3, 7, 0, 3]), -inf)"]}} +{"task_id": "f_374", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.optimize import curve_fit\n\n\ndef f_374(X, Y):\n \"\"\"\n Adjust a quadratic function to the given data (X, Y) and plot the data along with the fit.\n\n Parameters:\n - X (list or np.array): The X data 
points.\n - Y (list or np.array): The Y data points.\n\n Returns:\n tuple:\n - list: The optimized parameters of the quadratic function (a, b, c).\n - matplotlib.axes.Axes: The plot showing the data points and the quadratic fit.\n\n Requirements:\n - matplotlib.pyplot\n - scipy.optimize.curve_fit\n\n Example:\n >>> np.random.seed(42)\n >>> X = np.linspace(-10, 10, 100)\n >>> Y = 3*X**2 + 2*X + 1 + np.random.normal(0, 20, len(X))\n >>> params, ax = f_374(X, Y)\n >>> params\n [3.0366511660907975, 2.1379326607136035, -2.3233168384548284]\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n def func(x, a, b, c):\n return a * x ** 2 + b * x + c\n\n popt, pcov = curve_fit(func, X, Y)\n\n fig, ax = plt.subplots()\n ax.scatter(X, Y)\n ax.plot(X, func(X, *popt), \"r-\")\n\n return list(popt), ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport itertools\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.random_seed = 42\n np.random.seed(self.random_seed)\n self.test_data = [\n (\n np.linspace(-10, 10, 100),\n 3 * np.linspace(-10, 10, 100) ** 2\n + 2 * np.linspace(-10, 10, 100)\n + 1\n + np.random.normal(0, 20, 100),\n ),\n (\n np.linspace(-5, 5, 100),\n -2 * np.linspace(-5, 5, 100) ** 2\n + 4 * np.linspace(-5, 5, 100)\n - 3\n + np.random.normal(0, 10, 100),\n ),\n (\n np.linspace(-100, 100, 100),\n 0.5 * np.linspace(-100, 100, 100) ** 2\n + 1 * np.linspace(-100, 100, 100)\n + 10\n + np.random.normal(0, 50, 100),\n ),\n (\n np.linspace(-1, 1, 100),\n 10 * np.linspace(-1, 1, 100) ** 2\n + 5 * np.linspace(-1, 1, 100)\n + 2\n + np.random.normal(0, 1, 100),\n ),\n ]\n def assertDataInPlot(self, X, Y, ax):\n xdata, ydata = ax.collections[0].get_offsets().T # Access scatter plot data\n self.assertTrue(np.array_equal(X, xdata))\n self.assertTrue(np.array_equal(Y, ydata))\n def test_case_1(self):\n # Test fitting a basic quadratic function with expected params near 3, 2.\n X, Y = self.test_data[0]\n params, ax = f_374(X, Y)\n 
self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], 3, places=0)\n self.assertAlmostEqual(params[1], 2, places=0)\n def test_case_2(self):\n # Test fitting a basic quadratic function with expected params near -2, 4.\n X, Y = self.test_data[1]\n params, ax = f_374(X, Y)\n self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], -2, places=0)\n self.assertAlmostEqual(params[1], 4, places=0)\n def test_case_3(self):\n # Test fitting a wide parabola with parameters (0.5, 1).\n X, Y = self.test_data[2]\n params, ax = f_374(X, Y)\n self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], 0.5, places=0)\n self.assertAlmostEqual(params[1], 1, places=0)\n def test_case_4(self):\n # Test fitting a steep parabola with high coefficients (10, 5).\n X, Y = self.test_data[3]\n params, ax = f_374(X, Y)\n self.assertTrue(len(params) == 3)\n self.assertDataInPlot(X, Y, ax)\n self.assertTrue(isinstance(ax, plt.Axes))\n self.assertAlmostEqual(params[0], 10, places=0)\n self.assertAlmostEqual(params[1], 5, places=0)\n def test_case_5(self):\n # Test handling non-numeric data - convertable to int\n string_int_list = [\"1\", \"2\", \"3\"]\n int_list = [1, 2, 3]\n with self.assertRaises(TypeError):\n f_374(string_int_list, int_list)\n with self.assertRaises(TypeError):\n f_374(int_list, string_int_list)\n def test_case_6(self):\n # Test handling non-numeric data\n for X, Y in itertools.product([[\"a\", \"b\", \"c\"], [], np.array([])], repeat=2):\n with self.assertRaises(ValueError):\n f_374(X, Y)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "scipy.optimize.curve_fit"], "libs": ["matplotlib", "scipy"], "doc": {"description": ["Adjust a quadratic function to the given data (X, 
Y) and plot the data along with the fit."], "note": [], "params": ["X (list or np.array): The X data points.", "Y (list or np.array): The Y data points."], "returns": ["tuple:", "list: The optimized parameters of the quadratic function (a, b, c).", "matplotlib.axes.Axes: The plot showing the data points and the quadratic fit."], "reqs": ["matplotlib.pyplot", "scipy.optimize.curve_fit"], "raises": [], "example": [">>> np.random.seed(42)", ">>> X = np.linspace(-10, 10, 100)", ">>> Y = 3*X**2 + 2*X + 1 + np.random.normal(0, 20, len(X))", ">>> params, ax = f_374(X, Y)", ">>> params", "[3.0366511660907975, 2.1379326607136035, -2.3233168384548284]", ">>> type(ax)", ""]}} +{"task_id": "f_823", "prompt": "import pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_823(df):\n \"\"\"\n Plots the correlation matrix from numeric columns in a DataFrame and returns a DataFrame\n where the numeric columns are standardized to have mean 0 and variance 1.\n\n Parameters:\n df (pandas.DataFrame): Input DataFrame with columns of numeric data.\n\n Returns:\n pandas.DataFrame: Standardized DataFrame.\n matplotlib.figure.Figure: Figure object containing the heatmap of the correlation matrix.\n\n Requirements:\n - pandas\n - numpy\n - seaborn\n - matplotlib\n - sklearn\n\n Raises:\n - ValueError: If the DataFrame is empty or if no numeric columns are present.\n\n Notes:\n - Only numeric columns are considered for the heatmap. 
Non-numeric columns are ignored.\n\n Examples:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n >>> standardized_df, fig = f_823(df)\n >>> standardized_df\n A B\n 0 -1.224745 -1.224745\n 1 0.000000 0.000000\n 2 1.224745 1.224745\n >>> type(fig)\n \n \"\"\"", "canonical_solution": " numeric_df = df.select_dtypes(include=[np.number])\n if numeric_df.empty:\n raise ValueError(\"No numeric columns present\")\n\n correlation = numeric_df.corr()\n fig, ax = plt.subplots()\n sns.heatmap(correlation, ax=ax)\n\n numeric_cols = numeric_df.columns\n scaler = StandardScaler()\n df[numeric_cols] = scaler.fit_transform(df[numeric_cols])\n\n return df, fig", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case with integer values\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_2(self):\n # Test case with float values\n df = pd.DataFrame({\"X\": [1.1, 2.2, 3.3], \"Y\": [4.4, 5.5, 6.6]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_3(self):\n # Test case with negative values\n df = pd.DataFrame({\"A\": [-1, -2, -3], \"B\": [-4, -5, -6]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_4(self):\n # Test case with single column\n df = pd.DataFrame({\"A\": [1, 2, 3]})\n standardized_df, fig = f_823(df)\n self.assertTrue(np.allclose(standardized_df.mean(), 0))\n 
self.assertTrue(np.allclose(standardized_df.std(ddof=0), 1))\n self.assertTrue(isinstance(fig, plt.Figure))\n def test_case_5(self):\n # Test proper exception handling - no numeric columns\n df = pd.DataFrame({\"A\": [\"apple\", \"banana\", \"cherry\"]})\n with self.assertRaises(ValueError):\n f_823(df)\n def test_case_6(self):\n # Test proper exception handling - empty dataframe\n df = pd.DataFrame()\n with self.assertRaises(ValueError):\n f_823(df)\n def test_case_7(self):\n # Test ignoring non-numeric columns\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [\"x\", \"y\", \"z\"], \"C\": [4.5, 5.5, 6.5]})\n standardized_df, fig = f_823(df)\n self.assertTrue(\"B\" in standardized_df.columns)\n self.assertTrue(np.allclose(standardized_df[[\"A\", \"C\"]].mean(), 0))\n self.assertTrue(np.allclose(standardized_df[[\"A\", \"C\"]].std(ddof=0), 1))\n self.assertIsInstance(fig, plt.Figure)", "apis": ["matplotlib.pyplot.subplots", "sklearn.preprocessing.StandardScaler", "numpy.number", "seaborn.heatmap"], "libs": ["sklearn", "seaborn", "numpy", "matplotlib"], "doc": {"description": ["Plots the correlation matrix from numeric columns in a DataFrame and returns a DataFrame", "where the numeric columns are standardized to have mean 0 and variance 1.", "Notes:", "- Only numeric columns are considered for the heatmap. 
Non-numeric columns are ignored."], "note": [], "params": ["df (pandas.DataFrame): Input DataFrame with columns of numeric data."], "returns": ["pandas.DataFrame: Standardized DataFrame.", "matplotlib.figure.Figure: Figure object containing the heatmap of the correlation matrix."], "reqs": ["pandas", "numpy", "seaborn", "matplotlib", "sklearn"], "raises": ["ValueError: If the DataFrame is empty or if no numeric columns are present."], "example": ["Examples:", ">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})", ">>> standardized_df, fig = f_823(df)", ">>> standardized_df", "A B", "0 -1.224745 -1.224745", "1 0.000000 0.000000", "2 1.224745 1.224745", ">>> type(fig)", ""]}} {"task_id": "f_734", "prompt": "import random\nfrom collections import Counter\n\ndef f_734(strings: list) -> dict:\n \"\"\"\n Analyzes a given list of strings for the occurrence of a specific pattern and counts the occurrences.\n\n Parameters:\n - strings (list): A list of strings to be analyzed.\n\n Returns:\n dict: A dictionary with results of string analysis showing counts of the pattern.\n\n Requirements:\n - random\n - collections\n\n Example:\n >>> f_734(['abcd}def}', 'pqrs}tuv}', 'wxyz}123}', '456}789}', '0ab}cde}'])\n Counter({2: 10})\n \"\"\"", "canonical_solution": " if not strings:\n return Counter()\n\n pattern = '}'\n random_choices = random.choices(strings, k=10)\n pattern_counts = Counter([string.count(pattern) for string in random_choices])\n\n return pattern_counts", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n result = f_734(['abcd}def}', 'pqrs}tuv}', 'wxyz}123}', '456}789}', '0ab}cde}'])\n total_counts = sum(result.values())\n self.assertEqual(total_counts, 10)\n for key in result:\n self.assertTrue(1 <= key <= 2)\n def test_case_2(self):\n result = f_734(['abcd', 'pqrs', 'wxyz', '456', '0ab'])\n total_counts = sum(result.values())\n self.assertEqual(total_counts, 10)\n self.assertTrue(0 in result)\n 
self.assertEqual(result[0], 10)\n def test_case_3(self):\n result = f_734(['a}b}c}d', 'p}q}r}s', 'w}x}y}z', '4}5}6', '0}a}b'])\n total_counts = sum(result.values())\n self.assertEqual(total_counts, 10)\n for key in result:\n self.assertTrue(2 <= key <= 4)\n def test_case_4(self):\n result = f_734([])\n self.assertEqual(result, Counter())\n def test_case_5(self):\n result = f_734(['a}b}c}d}e}f}g}h}i}j}k}l}'])\n total_counts = sum(result.values())\n self.assertEqual(total_counts, 10)\n self.assertTrue(12 in result)\n self.assertEqual(result[12], 10)", "apis": ["collections.Counter", "random.choices"], "libs": ["collections", "random"], "doc": {"description": ["Analyzes a given list of strings for the occurrence of a specific pattern and counts the occurrences."], "note": [], "params": ["strings (list): A list of strings to be analyzed."], "returns": ["dict: A dictionary with results of string analysis showing counts of the pattern."], "reqs": ["random", "collections"], "raises": [], "example": [">>> f_734(['abcd}def}', 'pqrs}tuv}', 'wxyz}123}', '456}789}', '0ab}cde}'])", "Counter({2: 10})"]}} -{"task_id": "f_365", "prompt": "import numpy as np\nimport pandas as pd\n\n\ndef f_365(data_str, separator=\",\", bins=20):\n \"\"\"\n Convert a string of numerical values separated by a specified separator into a pandas\n integer series, and then draw a histogram of the data.\n\n The function raises a ValueError if data is empty or it fails to convert the data.\n It plots the histogram with the following attributes:\n - grid: True\n - rwidth: 0.9\n - color: '#607c8e'\n\n Parameters:\n - data_str (str): The string of numbers separated by the specified separator.\n - separator (str, optional): The separator used in the data string. Default is ','.\n - bins (int, optional): Number of histogram bins. Default is 20.\n\n Returns:\n - tuple: A tuple containing:\n 1. Series: A pandas Series of the data coonverted into integers.\n 2. 
Axes: The Axes object of the plotted histogram.\n\n Requirements:\n - numpy\n - pandas\n\n Example:\n >>> series, ax = f_365('1,2,3,4,5,5,5,4,3,2,1')\n >>> print(type(series), series.tolist())\n [1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1]\n >>> print(type(ax))\n \n \"\"\"", "canonical_solution": "\n data = np.fromstring(data_str, sep=separator)\n if data.size == 0:\n raise ValueError(\"Failed to find valid data\")\n\n data = pd.Series(data, dtype='int64')\n ax = data.plot.hist(grid=True, bins=bins, rwidth=0.9, color=\"#607c8e\")\n return data, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self) -> None:\n self.default_str = \"1,2,3,4,5,5,5,4,3,2,1\"\n self.default_expected = pd.Series([1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1])\n def assertHistogramAttributes(self, series, ax):\n # Check that the y-axis gridlines are set to True\n self.assertTrue(ax.yaxis.grid)\n # Ensure the histogram bars have the correct color\n self.assertEqual(matplotlib.colors.to_hex(ax.patches[0].get_fc()), \"#607c8e\")\n # Validate the heights of the histogram bars\n for patch in ax.patches:\n if (\n round(patch.get_x()) in series.values\n or round(patch.get_x() + patch.get_width()) in series.values\n ):\n self.assertTrue(patch.get_height() >= 0)\n def test_case_1(self):\n # Test default case\n series, ax = f_365(self.default_str)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, self.default_expected)\n def test_case_2(self):\n # Test function works on different bin sizes\n for bins in [5, 10, 15, 30, 100]:\n with self.subTest(bins=bins):\n series, ax = f_365(self.default_str, bins=bins)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, self.default_expected)\n def test_case_3(self):\n # Test custom separators\n data_str = \"1|2|3|4|5\"\n 
series, ax = f_365(data_str, separator=\"|\")\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, pd.Series([1, 2, 3, 4, 5]))\n def test_case_4(self):\n # Test negative and zero\n data_str = \"-5,-4,-3,-2,-1,0\"\n series, ax = f_365(data_str)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, pd.Series([-5, -4, -3, -2, -1, 0]))\n def test_case_5(self):\n # Test single item\n data_str = \"1\"\n series, ax = f_365(data_str)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, pd.Series([1]))\n def test_case_6(self):\n # Test with float\n series, ax = f_365(\"1.0,2.0,3.0,4.0,5.0,5.0,5.0,4.0,3.0,2.0,1.0\")\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, self.default_expected)\n def test_case_7(self):\n # Test with empty string\n data_str = \"\"\n with self.assertRaises(ValueError):\n f_365(data_str)\n def test_case_8(self):\n # Test with invalid data (contains string)\n data_str = \"a,b,c, 1\"\n with self.assertRaises(ValueError):\n f_365(data_str)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.Series", "numpy.fromstring"], "libs": ["numpy", "pandas"], "doc": {"description": ["Convert a string of numerical values separated by a specified separator into a pandas", "integer series, and then draw a histogram of the data.", "The function raises a ValueError if data is empty or it fails to convert the data.", "It plots the histogram with the following attributes:", "- grid: True", "- rwidth: 0.9", "- color: '#607c8e'"], "note": [], "params": ["data_str (str): The string of numbers separated by the specified separator.", "separator (str, optional): The separator used in the data string. Default is ','.", "bins (int, optional): Number of histogram bins. 
Default is 20."], "returns": ["tuple: A tuple containing:", "1. Series: A pandas Series of the data coonverted into integers.", "2. Axes: The Axes object of the plotted histogram."], "reqs": ["numpy", "pandas"], "raises": [], "example": [">>> series, ax = f_365('1,2,3,4,5,5,5,4,3,2,1')", ">>> print(type(series), series.tolist())", " [1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1]", ">>> print(type(ax))", ""]}} -{"task_id": "f_526", "prompt": "import shutil\nimport os\nimport fnmatch\nimport itertools\n\ndef f_526(src_dir, dst_dir):\n \"\"\"\n Copy all files from 'src_dir' to 'dst_dir' that match any pattern in ['*.txt', '*.docx'].\n\n Parameters:\n - src_dir (str): The source directory.\n - dst_dir (str): The destination directory.\n\n Returns:\n - str: The destination directory.\n \n Requirements:\n - shutil\n - os\n - fnmatch\n - itertools\n\n Example:\n >>> f_526('./source', './destination')\n >>> './destination'\n \"\"\"", "canonical_solution": " FILE_PATTERNS = ['*.txt', '*.docx']\n # Find all matching files\n matching_files = list(itertools.chain.from_iterable(\n fnmatch.filter(os.listdir(src_dir), pattern) for pattern in FILE_PATTERNS))\n\n for filename in matching_files:\n shutil.copy2(os.path.join(src_dir, filename), dst_dir)\n\n return dst_dir", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def base(self, src_dir, dst_dir):\n if os.path.exists(src_dir):\n shutil.rmtree(src_dir)\n # Create source directory\n os.mkdir(src_dir)\n # Create destination directory\n os.mkdir(dst_dir)\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with open(os.path.join(src_dir, filename), 'w') as f:\n f.write('test')\n # Run function\n f_526(src_dir, dst_dir)\n # Check files\n for d in [src_dir, dst_dir]:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n 
self.assertTrue(os.path.exists(os.path.join(d, 'd.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n if d == src_dir:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n else:\n self.assertFalse(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.doc')))\n \n def tearDown(self):\n for d in ['./source', './destination', './src', './dst', './s', './d']:\n if os.path.exists(d):\n shutil.rmtree(d)\n def test_case_1(self):\n self.base('./source', './destination')\n \n def test_case_2(self):\n self.base('./src', './dst')\n \n def test_case_3(self):\n self.base('./s', './d')\n \n def test_case_4(self):\n self.base('./s', './destination')\n def test_case_5(self):\n self.base('./source', './d')", "apis": ["os.listdir", "itertools.chain", "fnmatch.filter", "os.path", "itertools.chain.from_iterable", "os.path.join", "shutil.copy2"], "libs": ["itertools", "fnmatch", "shutil", "os"], "doc": {"description": ["Copy all files from 'src_dir' to 'dst_dir' that match any pattern in ['*.txt', '*.docx']."], "note": [], "params": ["src_dir (str): The source directory.", "dst_dir (str): The destination directory."], "returns": ["str: The destination directory."], "reqs": ["shutil", "os", "fnmatch", "itertools"], "raises": [], "example": [">>> f_526('./source', './destination')", ">>> './destination'"]}} -{"task_id": "f_588", "prompt": "import pandas as pd\nfrom sklearn.cluster import DBSCAN\n\ndef f_588(data, cols):\n \"\"\"\n Perform DBSCAN clustering on the data by transforming it into a DataFrame and recording the clusters in a new column named 'Cluster'.\n Please choose the parameters eps=3 and min_samples=2.\n \n Parameters:\n - data (list): List of lists with the data, where the length of the inner list equals the number of columns\n - cols (list): List of column names\n \n Returns:\n - df 
(DataFrame): The DataFrame with a new 'Cluster' column.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> data = [[5.1, 3.5], [4.9, 3.0], [4.7, 3.2]]\n >>> cols = ['x', 'y']\n >>> df = f_588(data, cols)\n >>> print(df)\n x y Cluster\n 0 5.1 3.5 0\n 1 4.9 3.0 0\n 2 4.7 3.2 0\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data, columns=cols)\n dbscan = DBSCAN(eps=3, min_samples=2)\n df['Cluster'] = dbscan.fit_predict(df)\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = f_588([[5.1, 3.5], [4.9, 3.0], [4.7, 3.2]], ['x', 'y'])\n print(df)\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0])))\n def test_case_2(self):\n df = f_588([[1, 2], [3, 4], [5, 6]], ['x', 'y'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0])))\n def test_case_3(self):\n df = f_588([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]], ['x', 'y'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0, 1, 1, -1])))\n def test_case_4(self):\n df = f_588([[1, 2, 3], [2, 2, 2], [2, 3, 4], [8, 7, 6], [8, 8, 8], [25, 80, 100]], ['x', 'y', 'z'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0, 1, 1, -1])))\n def test_case_5(self):\n df = f_588([[-1, -2], [-2, -2], [-2, -3], [-8, -7], [-8, -8], [-25, -80]], ['x', 'y'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0, 1, 1, -1])))", "apis": ["sklearn.cluster.DBSCAN", "pandas.DataFrame"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform DBSCAN clustering on the data by transforming it into a DataFrame and recording the clusters in a new column named 'Cluster'.", "Please choose the parameters eps=3 and min_samples=2."], "note": [], "params": ["data (list): List 
of lists with the data, where the length of the inner list equals the number of columns", "cols (list): List of column names"], "returns": ["df (DataFrame): The DataFrame with a new 'Cluster' column."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> data = [[5.1, 3.5], [4.9, 3.0], [4.7, 3.2]]", ">>> cols = ['x', 'y']", ">>> df = f_588(data, cols)", ">>> print(df)", "x y Cluster", "0 5.1 3.5 0", "1 4.9 3.0 0", "2 4.7 3.2 0"]}} -{"task_id": "f_894", "prompt": "import os\nimport hashlib\n\n# Constants\nDIRECTORY = \"./hashed_files\"\n\n\ndef f_894(input_string):\n \"\"\"\n Hash each non-empty line of a multi-line string using SHA256 and save the hashes to files.\n The filename is the first 10 characters of the hash, with a '.txt' extension.\n\n Parameters:\n - input_string (str): A multi-line string to be processed.\n\n Returns:\n - list[str]: A list of file paths where the hashes of non-empty lines are saved.\n\n Requirements:\n - os\n - hashlib\n\n Notes:\n - If the DIRECTORY does not exist, it is created.\n - Empty lines in the input string are ignored.\n\n Example:\n >>> file_paths = f_894('line a\\nfollows by line b\\n\\n...bye\\n')\n >>> print(file_paths)\n ['./hashed_files/489fe1fa6c.txt', './hashed_files/67009597fe.txt', './hashed_files/eab4758603.txt']\n \"\"\"", "canonical_solution": " if not os.path.exists(DIRECTORY):\n os.makedirs(DIRECTORY)\n\n file_paths = []\n lines = input_string.split(\"\\n\")\n for line in lines:\n if line: # Check if line is not empty\n line_hash = hashlib.sha256(line.encode()).hexdigest()\n filename = line_hash[:10] + \".txt\"\n filepath = os.path.join(DIRECTORY, filename)\n with open(filepath, \"w\", encoding=\"utf-8\") as file:\n file.write(line_hash)\n file_paths.append(filepath)\n\n return file_paths", "test": "import unittest\nimport os\nimport hashlib\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_894.\"\"\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up a temporary 
directory for test files.\"\"\"\n cls.temp_directory = \"./temp_test_files\"\n os.makedirs(cls.temp_directory, exist_ok=True)\n @classmethod\n def tearDownClass(cls):\n \"\"\"Clean up by removing the temporary directory after tests.\"\"\"\n shutil.rmtree(cls.temp_directory)\n dirs_to_remove = [\"hashed_files\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)\n def test_single_line(self):\n \"\"\"Test with a single line input.\"\"\"\n input_string = \"Hello world\"\n expected = [os.path.join(\"./hashed_files\", \"64ec88ca00.txt\")]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_multi_line(self):\n \"\"\"Test with a multi-line input.\"\"\"\n input_string = \"First line\\nSecond line\\nThird line\"\n expected = [\n os.path.join(\"./hashed_files\", \"2361df1018.txt\"),\n os.path.join(\"./hashed_files\", \"c8b588f708.txt\"),\n os.path.join(\"./hashed_files\", \"3195807ae4.txt\"),\n ]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_empty_input(self):\n \"\"\"Test with an empty string.\"\"\"\n input_string = \"\"\n expected = []\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_input_with_empty_lines(self):\n \"\"\"Test input string containing empty lines.\"\"\"\n input_string = \"Line one\\n\\nLine two\\n\"\n expected = [\n os.path.join(\"./hashed_files\", \"209f4c0be3.txt\"),\n os.path.join(\"./hashed_files\", \"1ae5466eb8.txt\"),\n ]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_no_newline_at_end(self):\n \"\"\"Test input string without a newline at the end.\"\"\"\n input_string = \"Line with no newline at end\"\n expected = [os.path.join(\"./hashed_files\", \"901dd863e9.txt\")]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_directory_creation(self):\n \"\"\"\n Test if the function creates the directory if it does not exist.\n \"\"\"\n # Assert that the DIRECTORY does not 
exist before calling the function\n self.assertFalse(os.path.exists(DIRECTORY))\n # Call the function with any string\n f_894(\"Test for directory creation\")\n # Check if the DIRECTORY has been created\n self.assertTrue(os.path.exists(DIRECTORY))\n # Optionally, clean up by removing the created directory after the test\n if os.path.exists(DIRECTORY):\n shutil.rmtree(DIRECTORY)", "apis": ["os.path.exists", "os.path", "os.makedirs", "hashlib.sha256", "os.path.join"], "libs": ["hashlib", "os"], "doc": {"description": ["Hash each non-empty line of a multi-line string using SHA256 and save the hashes to files.", "The filename is the first 10 characters of the hash, with a '.txt' extension.", "Notes:", "- If the DIRECTORY does not exist, it is created.", "- Empty lines in the input string are ignored."], "note": [], "params": ["input_string (str): A multi-line string to be processed."], "returns": ["list[str]: A list of file paths where the hashes of non-empty lines are saved."], "reqs": ["os", "hashlib"], "raises": [], "example": [">>> file_paths = f_894('line a\\nfollows by line b\\n\\n...bye\\n')", ">>> print(file_paths)", "['./hashed_files/489fe1fa6c.txt', './hashed_files/67009597fe.txt', './hashed_files/eab4758603.txt']"]}} -{"task_id": "f_352", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\n\n\ndef f_352(data, n_components=2, random_state=None):\n \"\"\"\n Performs Principal Component Analysis (PCA) on the provided dataset to reduce its dimensionality,\n and visualizes the results using a scatter plot.\n\n This function applies PCA to the dataset, reducing its features to the specified number of principal components.\n It then visualizes the reduced data in a scatter plot. For datasets reduced to a single component, the function\n generates a 1D scatter plot along the X-axis, with all Y-values set to zero. 
For reductions resulting in two or more\n components, only the first two principal components are visualized.\n\n Parameters:\n - data (ndarray): A numpy ndarray of shape (n_samples, n_features) representing the data.\n - n_components (int, optional): Number of components to keep. Defaults to 2.\n - random_state (int, optional): Seed for reproducibility. Defaults to None.\n\n Returns:\n dict: A dictionary containing:\n - \"transformed_data\" (np.ndarray): The transformed data.\n - \"ax\" (plt.Axes): The scatter plot visualizing the transformed data.\n\n Requirements:\n - numpy\n - matplotlib\n - sklearn\n\n Example:\n >>> data = np.random.random((100, 5))\n >>> results = f_352(data, random_state=42)\n >>> results['transformed_data'].shape\n (100, 2)\n >>> type(results['ax'])\n \n \"\"\"", "canonical_solution": " pca = PCA(n_components=n_components, random_state=random_state)\n transformed_data = pca.fit_transform(data)\n\n fig, ax = plt.subplots()\n if transformed_data.shape[1] == 1:\n ax.scatter(transformed_data[:, 0], np.zeros_like(transformed_data[:, 0]))\n else:\n ax.scatter(transformed_data[:, 0], transformed_data[:, 1])\n\n return {\"transformed_data\": transformed_data, \"ax\": ax}", "test": "import unittest\nfrom sklearn.decomposition import PCA\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.seed = 42\n self.n = 100\n self.n_dims = 5\n self.n_components = 2\n self.data = np.random.RandomState(self.seed).random((self.n, self.n_dims))\n def assert_pca_correctness(self, data, results, n_components, random_state):\n \"\"\"Helper method to assert PCA correctness\"\"\"\n # 1. 
Variance explained\n pca = PCA(n_components=n_components, random_state=random_state)\n pca.fit(data)\n explained_variance_ratio = pca.explained_variance_ratio_\n if data.shape[1] == 1:\n # For one-dimensional data, the explained variance ratio should be 1\n self.assertAlmostEqual(explained_variance_ratio[0], 1.0, delta=1e-2)\n else:\n cov_matrix = np.cov(data, rowvar=False)\n eigenvalues = np.linalg.eigvals(cov_matrix)\n sorted_eigenvalues = np.sort(eigenvalues)[::-1][:n_components]\n normalized_eigenvalues = sorted_eigenvalues / sum(eigenvalues)\n self.assertTrue(\n np.allclose(explained_variance_ratio, normalized_eigenvalues, atol=1e-1)\n )\n # 2. Orthogonality\n for i in range(n_components):\n for j in range(i + 1, n_components):\n dot_product = np.dot(\n results[\"transformed_data\"][:, i], results[\"transformed_data\"][:, j]\n )\n self.assertAlmostEqual(dot_product, 0, delta=1e-2)\n def test_case_1(self):\n # Test with default settings\n results = f_352(self.data, random_state=self.seed)\n self.assertEqual(results[\"transformed_data\"].shape, (self.n, self.n_components))\n x_data = results[\"ax\"].collections[0].get_offsets()[:, 0]\n y_data = results[\"ax\"].collections[0].get_offsets()[:, 1]\n self.assertTrue(np.array_equal(x_data, results[\"transformed_data\"][:, 0]))\n self.assertTrue(np.array_equal(y_data, results[\"transformed_data\"][:, 1]))\n self.assert_pca_correctness(self.data, results, self.n_components, self.seed)\n def test_case_2(self):\n # Test n_components\n for n_components in [1, 2, min(self.data.shape)]:\n results = f_352(self.data, n_components=n_components, random_state=42)\n self.assertEqual(results[\"transformed_data\"].shape[1], n_components)\n self.assert_pca_correctness(self.data, results, n_components, self.seed)\n def test_case_3(self):\n # Test when one of the features has zero variance\n data = self.data.copy()\n data[:, 1] = 0 # Second feature has zero variance\n results = f_352(data, n_components=2, random_state=self.seed)\n 
self.assertEqual(results[\"transformed_data\"].shape, (100, 2))\n self.assert_pca_correctness(data, results, 2, self.seed)\n def test_case_4(self):\n # Test with n_components greater than min(n_samples, n_features)\n data = np.random.RandomState(self.seed).randn(10, 2)\n with self.assertRaises(ValueError):\n f_352(data, n_components=3, random_state=self.seed)\n def test_case_5(self):\n # Test with a single sample\n data = np.random.RandomState(self.seed).randn(1, self.n_dims)\n with self.assertRaises(ValueError):\n f_352(data)\n def test_case_6(self):\n # Edge case - test when dataset contains NaN\n data = self.data.copy()\n data[0, 0] = np.nan # Introduce a NaN value\n with self.assertRaises(ValueError):\n f_352(data, n_components=2, random_state=self.seed)\n def test_case_7(self):\n # Edge case - test when dataset contains infinite values\n data = self.data.copy()\n data[0, 0] = np.inf # Introduce an infinite value\n with self.assertRaises(ValueError):\n f_352(data, n_components=2, random_state=self.seed)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "sklearn.decomposition.PCA", "numpy.zeros_like"], "libs": ["matplotlib", "numpy", "sklearn"], "doc": {"description": ["Performs Principal Component Analysis (PCA) on the provided dataset to reduce its dimensionality,", "and visualizes the results using a scatter plot.", "This function applies PCA to the dataset, reducing its features to the specified number of principal components.", "It then visualizes the reduced data in a scatter plot. For datasets reduced to a single component, the function", "generates a 1D scatter plot along the X-axis, with all Y-values set to zero. For reductions resulting in two or more", "components, only the first two principal components are visualized."], "note": [], "params": ["data (ndarray): A numpy ndarray of shape (n_samples, n_features) representing the data.", "n_components (int, optional): Number of components to keep. 
Defaults to 2.", "random_state (int, optional): Seed for reproducibility. Defaults to None."], "returns": ["dict: A dictionary containing:", "\"transformed_data\" (np.ndarray): The transformed data.", "\"ax\" (plt.Axes): The scatter plot visualizing the transformed data."], "reqs": ["numpy", "matplotlib", "sklearn"], "raises": [], "example": [">>> data = np.random.random((100, 5))", ">>> results = f_352(data, random_state=42)", ">>> results['transformed_data'].shape", "(100, 2)", ">>> type(results['ax'])", ""]}} -{"task_id": "f_793", "prompt": "import numpy as np\nfrom scipy.linalg import svd\n\ndef f_793(rows=3, columns=2, seed=0):\n \"\"\"\n Generate a matrix of random values with specified dimensions and perform Singular Value Decomposition (SVD) on it.\n\n Requirements:\n - numpy\n - scipy.linalg.svd\n\n Parameters:\n - rows (int): Number of rows for the random matrix. Default is 3.\n - columns (int): Number of columns for the random matrix. Default is 2.\n - seed (int, optional): Seed for the random number generator to ensure reproducibility. 
Default is None.\n\n Returns:\n tuple: A tuple containing three elements:\n - U (ndarray): The unitary matrix U.\n - s (ndarray): The singular values, sorted in descending order.\n - Vh (ndarray): The conjugate transpose of the unitary matrix V.\n\n Example:\n >>> U, s, Vh = f_793(3, 2, seed=42)\n >>> print('U shape:', U.shape)\n U shape: (3, 3)\n >>> print('s shape:', s.shape)\n s shape: (2,)\n >>> print('Vh shape:', Vh.shape)\n Vh shape: (2, 2)\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n matrix = np.random.rand(rows, columns)\n U, s, Vh = svd(matrix)\n\n return U, s, Vh", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n # Test with default 3x2 matrix\n U, s, Vh = f_793(seed=3)\n self.assertEqual(U.shape, (3, 3))\n self.assertEqual(s.shape, (2,))\n self.assertEqual(Vh.shape, (2, 2))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_2(self):\n # Test with a 5x5 square matrix\n U, s, Vh = f_793(5, 5, seed=42)\n self.assertEqual(U.shape, (5, 5))\n self.assertEqual(s.shape, (5,))\n self.assertEqual(Vh.shape, (5, 5))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_3(self):\n # Test with a 2x3 matrix (more columns than rows)\n U, s, Vh = f_793(2, 3, seed=12)\n self.assertEqual(U.shape, (2, 2))\n self.assertEqual(s.shape, (2,))\n self.assertEqual(Vh.shape, (3, 3))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_4(self):\n # Test with a 1x1 matrix (a scalar)\n U, s, Vh = f_793(1, 1, seed=0)\n self.assertEqual(U.shape, (1, 1))\n self.assertEqual(s.shape, (1,))\n self.assertEqual(Vh.shape, (1, 1))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_5(self):\n # Test with a 4x3 matrix\n U, s, Vh = f_793(4, 3, seed=1)\n self.assertEqual(U.shape, (4, 4))\n self.assertEqual(s.shape, (3,))\n self.assertEqual(Vh.shape, (3, 3))\n self.assertTrue(np.all(s >= 0))", "apis": ["numpy.random", "scipy.linalg.svd", "numpy.random.rand", "numpy.random.seed"], "libs": ["numpy", "scipy"], "doc": 
{"description": ["Generate a matrix of random values with specified dimensions and perform Singular Value Decomposition (SVD) on it."], "note": [], "params": ["rows (int): Number of rows for the random matrix. Default is 3.", "columns (int): Number of columns for the random matrix. Default is 2.", "seed (int, optional): Seed for the random number generator to ensure reproducibility. Default is None."], "returns": ["tuple: A tuple containing three elements:", "U (ndarray): The unitary matrix U.", "s (ndarray): The singular values, sorted in descending order.", "Vh (ndarray): The conjugate transpose of the unitary matrix V."], "reqs": ["numpy", "scipy.linalg.svd"], "raises": [], "example": [">>> U, s, Vh = f_793(3, 2, seed=42)", ">>> print('U shape:', U.shape)", "U shape: (3, 3)", ">>> print('s shape:', s.shape)", "s shape: (2,)", ">>> print('Vh shape:', Vh.shape)", "Vh shape: (2, 2)"]}} -{"task_id": "f_772", "prompt": "import random\nimport string\n\ndef f_772(word):\n \"\"\"\n Generates a list of random pairs of adjacent letters from the given word. The number of such pairs will be equal to the length of the constant POSSIBLE_LETTERS.\n \n Parameters:\n word (str): The input string. Must only contain letters.\n \n Returns:\n list: A list of random pairs of adjacent letters from the word. 
If the word has fewer than 2 letters, returns a list of empty strings based on POSSIBLE_LETTERS length.\n \n Examples:\n >>> random.seed(0); f_772('abcdef')\n ['ab', 'bc', 'cd']\n >>> random.seed(0); f_772('xyz')\n ['xy', 'xy', 'yz']\n \"\"\"", "canonical_solution": " if not all(char in string.ascii_letters for char in word):\n raise ValueError(\"Input must only contain letters.\")\n \n if len(word) < 2:\n return ['' for _ in range(len(['a', 'b', 'c']))]\n \n pairs = [''.join(x) for x in zip(word, word[1:])]\n random_pairs = [random.choice(pairs) for _ in range(len(['a', 'b', 'c']))]\n\n return random_pairs", "test": "import unittest\nimport random\n# Assuming the function is correctly imported from its script\n# from f_772 import f_772 \nclass TestCases(unittest.TestCase):\n def test_with_valid_input(self):\n random.seed(0)\n result = f_772('abcdef')\n self.assertEqual(len(result), 3, \"Output list should have length 3\")\n valid_pairs = ['ab', 'bc', 'cd', 'de', 'ef']\n for pair in result:\n self.assertIn(pair, valid_pairs, f\"Pair '{pair}' is not a valid adjacent pair in 'abcdef'\")\n def test_single_character(self):\n random.seed(42)\n result = f_772('a')\n expected = ['', '', '']\n self.assertEqual(result, expected, \"Should return list of empty strings for a single character\")\n def test_empty_string(self):\n random.seed(55)\n result = f_772('')\n expected = ['', '', '']\n self.assertEqual(result, expected, \"Should return list of empty strings for an empty string\")\n def test_non_letter_input(self):\n random.seed(0)\n with self.assertRaises(ValueError):\n f_772('123')\n def test_long_input(self):\n random.seed(5)\n result = f_772('abcdefghijklmnopqrstuvwxyz')\n all_pairs = [''.join(x) for x in zip('abcdefghijklmnopqrstuvwxyz', 'abcdefghijklmnopqrstuvwxyz'[1:])]\n for pair in result:\n self.assertIn(pair, all_pairs, f\"Pair '{pair}' is not a valid adjacent pair in the alphabet\")", "apis": ["random.choice", "string.ascii_letters"], "libs": ["string", 
"random"], "doc": {"description": ["Generates a list of random pairs of adjacent letters from the given word. The number of such pairs will be equal to the length of the constant POSSIBLE_LETTERS."], "note": [], "params": ["word (str): The input string. Must only contain letters."], "returns": ["list: A list of random pairs of adjacent letters from the word. If the word has fewer than 2 letters, returns a list of empty strings based on POSSIBLE_LETTERS length."], "reqs": [], "raises": [], "example": ["Examples:", ">>> random.seed(0); f_772('abcdef')", "['ab', 'bc', 'cd']", ">>> random.seed(0); f_772('xyz')", "['xy', 'xy', 'yz']"]}} -{"task_id": "f_919", "prompt": "import datetime\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nTIME_FORMAT = \"%d/%m/%y %H:%M:%S.%f\"\n\n\ndef f_919(time_strings):\n \"\"\"\n Compute the differences in seconds between consecutive datetime strings and plot these differences as a bar chart.\n\n Parameters:\n - time_strings (list of str): A list of datetime strings in the format 'dd/mm/yy HH:MM:SS.fff'.\n\n Returns:\n - matplotlib.axes.Axes: The axes object of the plotted bar chart. 
This object allows further customization of the plot outside this function.\n\n Requirements:\n - datetime\n - numpy\n - matplotlib\n\n Note:\n - The function requires the datetime, numpy, and matplotlib.pyplot modules.\n - The datetime strings in the input list should follow the specific format specified in TIME_FORMAT.\n - The function calculates the time differences between each pair of consecutive datetime strings in the list.\n\n Example:\n >>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']\n >>> ax = f_919(time_strings)\n >>> plt.show() # This will display the bar chart\n \"\"\"", "canonical_solution": " # Calculate time differences\n differences = (\n np.diff([datetime.datetime.strptime(t, TIME_FORMAT) for t in time_strings])\n .astype(\"timedelta64[s]\")\n .astype(int)\n )\n\n # Plotting the bar chart\n _ = plt.bar(range(len(differences)), differences)\n plt.xlabel(\"Index\")\n plt.ylabel(\"Time Difference (seconds)\")\n plt.title(\"Time Differences Between Consecutive Timestamps\")\n return plt.gca()", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_919\"\"\"\n def test_regular_time_strings(self):\n \"\"\"Test Regular Time Strings with 1-second difference\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:33.123\",\n \"30/03/09 16:31:34.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [1.0, 1.0])\n def test_different_time_units(self):\n \"\"\"Test Time Strings with Different Day, Hour, Minute, and Second Differences\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"31/03/09 17:32:33.123\",\n \"01/04/09 18:33:34.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n expected_diffs = [(86400 + 3600 + 60 + 1), (86400 + 3600 + 60 + 1)]\n 
self.assertEqual(bar_heights, expected_diffs)\n def test_millisecond_difference(self):\n \"\"\"Test Time Strings with Millisecond Differences\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:32.623\",\n \"30/03/09 16:31:33.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [0, 0])\n def test_no_difference(self):\n \"\"\"Test Time Strings with No Difference\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:32.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [0.0, 0.0])\n def test_large_list(self):\n \"\"\"Test Large List of Time Strings with Constant 1-second Difference\"\"\"\n time_strings = [\"30/03/09 16:31:\" + f\"{i:02}.123\" for i in range(30, 40)]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [1.0] * 9)", "apis": ["numpy.diff", "matplotlib.pyplot.xlabel", "matplotlib.pyplot.title", "datetime.datetime.strptime", "matplotlib.pyplot.bar", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "datetime.datetime"], "libs": ["numpy", "matplotlib", "datetime"], "doc": {"description": ["Compute the differences in seconds between consecutive datetime strings and plot these differences as a bar chart."], "note": ["The function requires the datetime, numpy, and matplotlib.pyplot modules.", "The datetime strings in the input list should follow the specific format specified in TIME_FORMAT.", "The function calculates the time differences between each pair of consecutive datetime strings in the list."], "params": ["time_strings (list of str): A list of datetime strings in the format 'dd/mm/yy HH:MM:SS.fff'."], "returns": ["matplotlib.axes.Axes: The axes object of the plotted bar chart. 
This object allows further customization of the plot outside this function."], "reqs": ["datetime", "numpy", "matplotlib"], "raises": [], "example": [">>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']", ">>> ax = f_919(time_strings)", ">>> plt.show() # This will display the bar chart"]}} -{"task_id": "f_355", "prompt": "from scipy.spatial.distance import cdist\nfrom sklearn.datasets import make_blobs\nimport matplotlib.pyplot as plt\n\n\ndef f_355(n_samples=200, centers=4, plot_path=None, random_seed=None):\n \"\"\"\n Generate a synthetic 2D dataset using make_blobs, visualize the dataset, and then calculate\n the Euclidean distance between individual samples of the dataset.\n\n Parameters:\n - n_samples (int): Number of samples to generate. Default is 200.\n - centers (int): Number of centers to generate. Default is 4.\n - plot_path (str, optional): Path to save the plot. If None, the plot will be returned.\n - random_seed (int, optional): Seed for random number generation. 
Default is None.\n\n Returns:\n - tuple:\n - ndarray: A 2D array with distances between each sample.\n - Axes or None: If plot_path is None, returns the matplotlib Axes object of the plot.\n Otherwise, saves the plot to the provided path and return None.\n Plot shows values of the first feature dimension on the x-axis, values\n of the second feature dimension on the y-axis, and labels of the synthetic\n examples as color.\n\n Requirements:\n - scipy.spatial.distance.cdist\n - sklearn.datasets.make_blobs\n - matplotlib.pyplot\n\n Example:\n >>> distances, plot = f_355(random_seed=42)\n >>> distances.shape\n (200, 200)\n >>> plot\n \n \"\"\"", "canonical_solution": " X, y = make_blobs(\n n_samples=n_samples,\n n_features=2,\n centers=centers,\n random_state=random_seed,\n )\n\n fig, ax = plt.subplots()\n\n ax.scatter(X[:, 0], X[:, 1], c=y)\n\n if plot_path:\n plt.savefig(plot_path)\n plt.close(fig)\n return cdist(X, X), None\n\n return cdist(X, X), ax", "test": "import unittest\nimport tempfile\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.seed = 42\n self.temp_dir = tempfile.TemporaryDirectory()\n def test_case_1(self):\n # Default parameters\n distances, plot = f_355()\n self.assertEqual(distances.shape, (200, 200))\n self.assertEqual(len(plot.collections[0].get_offsets()), 200)\n self.assertEqual(len(set(plot.collections[0].get_array())), 4)\n def test_case_2(self):\n # Custom parameters\n n_samples, centers = 50, 5\n distances, plot = f_355(\n random_seed=self.seed, n_samples=n_samples, centers=centers\n )\n self.assertEqual(distances.shape, (n_samples, n_samples))\n self.assertEqual(len(plot.collections[0].get_offsets()), n_samples)\n self.assertEqual(len(set(plot.collections[0].get_array())), centers)\n def test_case_3(self):\n # Saving the plot to a path\n plot_path = os.path.join(self.temp_dir.name, \"test_plot.png\")\n distances, plot = f_355(random_seed=self.seed, 
plot_path=plot_path)\n self.assertEqual(distances.shape, (200, 200))\n self.assertTrue(os.path.exists(plot_path))\n self.assertIsNone(plot)\n def test_case_4(self):\n # Test reproducibility with the same seed\n distances1, _ = f_355(random_seed=self.seed)\n distances2, _ = f_355(random_seed=self.seed)\n np.testing.assert_array_equal(distances1, distances2)\n # Test different outputs with different seeds\n distances3, _ = f_355(random_seed=43)\n with self.assertRaises(AssertionError):\n np.testing.assert_array_equal(distances1, distances3)\n def test_case_5(self):\n # Test negative parameters for n_samples\n with self.assertRaises(ValueError):\n f_355(n_samples=-100, random_seed=self.seed)\n def test_case_6(self):\n # Test non-integer inputs for n_samples\n with self.assertRaises(TypeError):\n f_355(n_samples=200.5, random_seed=self.seed)\n def tearDown(self):\n plt.close(\"all\")\n self.temp_dir.cleanup()", "apis": ["sklearn.datasets.make_blobs", "matplotlib.pyplot.close", "matplotlib.pyplot.savefig", "scipy.spatial.distance.cdist", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "sklearn", "scipy"], "doc": {"description": ["Generate a synthetic 2D dataset using make_blobs, visualize the dataset, and then calculate", "the Euclidean distance between individual samples of the dataset."], "note": [], "params": ["n_samples (int): Number of samples to generate. Default is 200.", "centers (int): Number of centers to generate. Default is 4.", "plot_path (str, optional): Path to save the plot. If None, the plot will be returned.", "random_seed (int, optional): Seed for random number generation. 
Default is None."], "returns": ["tuple:", "ndarray: A 2D array with distances between each sample.", "Axes or None: If plot_path is None, returns the matplotlib Axes object of the plot.", "Otherwise, saves the plot to the provided path and return None.", "Plot shows values of the first feature dimension on the x-axis, values", "of the second feature dimension on the y-axis, and labels of the synthetic", "examples as color."], "reqs": ["scipy.spatial.distance.cdist", "sklearn.datasets.make_blobs", "matplotlib.pyplot"], "raises": [], "example": [">>> distances, plot = f_355(random_seed=42)", ">>> distances.shape", "(200, 200)", ">>> plot", ""]}} -{"task_id": "f_910", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nNUM_SAMPLES = 100\nNUM_OUTLIERS = 5\n\n\ndef f_910(num_samples=NUM_SAMPLES, num_outliers=NUM_OUTLIERS):\n \"\"\"\n Generate a dataset comprising both normal data and artificially introduced outliers,\n and plot a histogram of the combined data. The function detects outliers in the dataset\n using the Interquartile Range (IQR) method, but it only considers the normally distributed\n portion of the data for outlier detection. The outliers detected and the artificially\n introduced outliers might not always coincide.\n\n Parameters:\n - num_samples (int): Number of samples to be drawn from a normal distribution. The default \n value is 100. If set to zero or a negative number, no normal data will be generated, \n and the dataset will only contain artificially introduced outliers.\n - num_outliers (int): Number of outliers to be artificially introduced into the dataset. \n These outliers are uniformly distributed between -10 and 10. The default value is 5. 
\n If set to zero, no outliers will be artificially introduced.\n\n\n Returns:\n - data (numpy array): The combined dataset, including both normally distributed data and \n the artificially introduced outliers.\n - outliers_detected (numpy array): The outliers detected using the IQR method. This \n detection is based solely on the normally distributed portion of the data.\n - ax (matplotlib.axes._subplots.AxesSubplot): The AxesSubplot object for the histogram \n plot of the combined dataset.\n\n Requirements:\n - numpy\n - matplotlib\n\n Note:\n - The artificially introduced outliers are not necessarily the same as the outliers\n detected by the IQR method. The IQR method is applied only to the normally distributed\n data, and thus some of the artificially introduced outliers may not be detected,\n and some normal data points may be falsely identified as outliers.\n\n Example:\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> data, outliers_detected, ax = f_910()\n >>> print(outliers_detected)\n [-9.61613603 -3.96850367 3.20347075]\n \"\"\"", "canonical_solution": " normal_data = np.random.normal(size=num_samples)\n outliers = np.random.uniform(low=-10, high=10, size=num_outliers)\n data = np.concatenate([normal_data, outliers]) if num_samples > 0 else outliers\n\n # Identify outliers using IQR (only if there is normal data)\n outliers_detected = np.array([])\n if num_samples > 0:\n q75, q25 = np.percentile(normal_data, [75, 25])\n iqr = q75 - q25\n lower_bound = q25 - (iqr * 1.5)\n upper_bound = q75 + (iqr * 1.5)\n outliers_detected = data[(data < lower_bound) | (data > upper_bound)]\n\n # Plot histogram\n _, ax = plt.subplots()\n ax.hist(data, bins=30)\n\n return data, outliers_detected, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_910.\"\"\"\n def test_default_values(self):\n \"\"\"Test the function with default values.\"\"\"\n 
np.random.seed(0)\n data, _, _ = f_910()\n self.assertEqual(len(data), 105)\n def test_custom_values(self):\n \"\"\"Test the function with custom values.\"\"\"\n np.random.seed(1)\n data, outliers_detected, _ = f_910(num_samples=50, num_outliers=10)\n self.assertEqual(len(data), 60)\n # Replicate the IQR calculation for testing\n normal_data = data[:50] # Assuming the first 50 are normal data\n q75, q25 = np.percentile(normal_data, [75, 25])\n iqr = q75 - q25\n lower_bound = q25 - (iqr * 1.5)\n upper_bound = q75 + (iqr * 1.5)\n expected_outliers_count = len(\n [o for o in data if o < lower_bound or o > upper_bound]\n )\n self.assertEqual(len(outliers_detected), expected_outliers_count)\n def test_no_outliers(self):\n \"\"\"Test the function with no outliers.\"\"\"\n np.random.seed(2)\n data, outliers_detected, ax = f_910(num_samples=100, num_outliers=0)\n self.assertEqual(len(data), 100)\n # Adjust the expectation to consider possible false positives\n self.assertTrue(len(outliers_detected) <= 1) # Allow for up to 1 false positive\n def test_only_outliers(self):\n \"\"\"Test the function with only outliers.\"\"\"\n np.random.seed(3)\n data, outliers_detected, _ = f_910(num_samples=0, num_outliers=100)\n self.assertEqual(len(data), 100)\n # Since no normal data is generated, IQR is not applied, and no outliers are detected.\n self.assertEqual(len(outliers_detected), 0)\n def test_negative_values(self):\n \"\"\"Test the function with negative values.\"\"\"\n np.random.seed(4)\n with self.assertRaises(ValueError):\n f_910(num_samples=-10, num_outliers=-5)\n def tearDown(self):\n plt.close()", "apis": ["numpy.random", "numpy.random.uniform", "numpy.percentile", "numpy.random.normal", "numpy.array", "matplotlib.pyplot.subplots", "numpy.concatenate"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Generate a dataset comprising both normal data and artificially introduced outliers,", "and plot a histogram of the combined data. 
The function detects outliers in the dataset", "using the Interquartile Range (IQR) method, but it only considers the normally distributed", "portion of the data for outlier detection. The outliers detected and the artificially", "introduced outliers might not always coincide."], "note": ["The artificially introduced outliers are not necessarily the same as the outliers", "detected by the IQR method. The IQR method is applied only to the normally distributed", "data, and thus some of the artificially introduced outliers may not be detected,", "and some normal data points may be falsely identified as outliers."], "params": ["num_samples (int): Number of samples to be drawn from a normal distribution. The default", "value is 100. If set to zero or a negative number, no normal data will be generated,", "and the dataset will only contain artificially introduced outliers.", "num_outliers (int): Number of outliers to be artificially introduced into the dataset.", "These outliers are uniformly distributed between -10 and 10. The default value is 5.", "If set to zero, no outliers will be artificially introduced."], "returns": ["data (numpy array): The combined dataset, including both normally distributed data and", "the artificially introduced outliers.", "outliers_detected (numpy array): The outliers detected using the IQR method. 
This", "detection is based solely on the normally distributed portion of the data.", "ax (matplotlib.axes._subplots.AxesSubplot): The AxesSubplot object for the histogram", "plot of the combined dataset."], "reqs": ["numpy", "matplotlib"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(0)", ">>> data, outliers_detected, ax = f_910()", ">>> print(outliers_detected)", "[-9.61613603 -3.96850367 3.20347075]"]}} -{"task_id": "f_849", "prompt": "import os\nimport requests\nfrom zipfile import ZipFile, BadZipFile\n\n\ndef f_849(url, download_path=\"mnt/data/downloads/\"):\n \"\"\"\n Downloads and extracts a ZIP file from a specified URL to a given directory.\n\n Parameters:\n - url (str): The URL from which to download the ZIP file. It should be a valid and accessible URL.\n - download_path (str): The directory path where the ZIP file will be downloaded and extracted.\n Defaults to \"mnt/data/downloads/\".\n\n Returns:\n - str: Path to the directory containing the extracted contents. 
If an error occurs, a descriptive\n message detailing the type of error is returned.\n\n Raises:\n - Network Issues or Invalid URL: Returns \"Error: Unable to download the file from the provided URL.\"\n if there are issues in reaching the URL or downloading the file.\n - Incorrect File Type: Returns \"Error: The URL does not point to a ZIP file.\" if the downloaded file's\n content type is not 'application/zip'.\n - Corrupt ZIP File: Returns \"Error: The downloaded file is not a valid ZIP file.\" if the downloaded file\n is a ZIP file but is corrupt or cannot be extracted.\n - General Exceptions: Catches and reports any other exceptions (like runtime errors) that occur during\n the process with a specific error message, formatted as \"Error: [exception message]\".\n\n\n Requirements:\n - requests\n - os\n - zipfile\n\n Example:\n >>> f_849('https://example.com/file.zip')\n 'mnt/data/downloads/file'\n \"\"\"", "canonical_solution": " if not os.path.exists(download_path):\n os.makedirs(download_path)\n\n try:\n response = requests.get(url, timeout=5)\n response.raise_for_status()\n\n # Verify content type\n if \"application/zip\" not in response.headers.get(\"Content-Type\", \"\"):\n return \"Error: The URL does not point to a ZIP file.\"\n\n file_name = os.path.join(download_path, os.path.basename(url))\n\n with open(file_name, \"wb\") as f:\n f.write(response.content)\n\n extract_path = os.path.splitext(file_name)[0]\n\n if not os.path.exists(extract_path):\n os.makedirs(extract_path)\n\n with ZipFile(file_name, \"r\") as zip_ref:\n zip_ref.extractall(extract_path)\n\n return extract_path\n\n except requests.RequestException:\n return \"Error: Unable to download the file from the provided URL.\"\n except BadZipFile:\n return \"Error: The downloaded file is not a valid ZIP file.\"\n except RuntimeError as e:\n return f\"Error: {str(e)}\"", "test": "import unittest\nfrom unittest.mock import patch\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test 
cases for f_849.\"\"\"\n def test_valid_zip_url(self):\n \"\"\"Test a valid ZIP URL.\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-1.zip\"\n result = f_849(url)\n self.assertTrue(result.startswith(\"mnt/data/downloads/\"))\n self.assertTrue(result.endswith(\"sample-1\"))\n shutil.rmtree(\"mnt/data/downloads\")\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"Test an invalid URL.\"\"\"\n mock_get.side_effect = requests.RequestException()\n url = \"https://invalid-url.com/sample.zip\"\n result = f_849(url)\n self.assertEqual(\n result,\n \"Error: Unable to download the file from the provided URL.\",\n )\n @patch(\"requests.get\")\n def test_non_zip_content(self, mock_get):\n \"\"\"Test a URL that does not point to a ZIP file.\"\"\"\n mock_get.return_value.status_code = 200\n mock_get.return_value.headers = {\"Content-Type\": \"text/plain\"}\n mock_get.return_value.content = b\"Not a ZIP file\"\n url = \"https://valid-url.com/not-a-zip.txt\"\n result = f_849(url)\n self.assertEqual(result, \"Error: The URL does not point to a ZIP file.\")\n @patch(\"requests.get\")\n def test_download_invald_zip_file(self, mock_get):\n \"\"\"Test a URL that points to a ZIP file, but the file is invalid.\"\"\"\n mock_get.return_value.status_code = 200\n mock_get.return_value.headers = {\"Content-Type\": \"application/zip\"}\n mock_get.return_value.content = b\"Some ZIP content\"\n url = \"https://valid-zip-url.com/sample.zip\"\n custom_path = \"mnt/data/custom_path/\"\n result = f_849(url, custom_path)\n self.assertEqual(result, \"Error: The downloaded file is not a valid ZIP file.\")\n @patch(\"requests.get\")\n def test_general_error(self, mock_get):\n \"\"\"Test a general error.\"\"\"\n mock_get.side_effect = RuntimeError(\"Unexpected error\")\n url = \"https://error-url.com/error.zip\"\n result = f_849(url)\n self.assertTrue(result.startswith(\"Error: Unexpected error\"))\n @classmethod\n def tearDownClass(cls):\n # Cleanup the test 
directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["os.path.exists", "os.path.splitext", "zipfile.ZipFile", "os.path.basename", "os.path", "requests.get", "requests.RequestException", "os.makedirs", "os.path.join"], "libs": ["requests", "zipfile", "os"], "doc": {"description": ["Downloads and extracts a ZIP file from a specified URL to a given directory."], "note": [], "params": ["url (str): The URL from which to download the ZIP file. It should be a valid and accessible URL.", "download_path (str): The directory path where the ZIP file will be downloaded and extracted.", "Defaults to \"mnt/data/downloads/\"."], "returns": ["str: Path to the directory containing the extracted contents. If an error occurs, a descriptive", "message detailing the type of error is returned."], "reqs": ["requests", "os", "zipfile"], "raises": ["Network Issues or Invalid URL: Returns \"Error: Unable to download the file from the provided URL.\"", "if there are issues in reaching the URL or downloading the file.", "Incorrect File Type: Returns \"Error: The URL does not point to a ZIP file.\" if the downloaded file's", "content type is not 'application/zip'.", "Corrupt ZIP File: Returns \"Error: The downloaded file is not a valid ZIP file.\" if the downloaded file", "is a ZIP file but is corrupt or cannot be extracted.", "General Exceptions: Catches and reports any other exceptions (like runtime errors) that occur during", "the process with a specific error message, formatted as \"Error: [exception message]\"."], "example": [">>> f_849('https://example.com/file.zip')", "'mnt/data/downloads/file'"]}} -{"task_id": "f_565", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\n\ndef f_565(tuples_list, n_components):\n \"\"\"\n Perform Principal Component Analysis (PCA) on a list of tuples.\n \n Parameters:\n - tuples_list (list): The list of tuples.\n \n Returns:\n - 
transformed_data (ndarray): The transformed data.\n\n Requirements:\n - numpy\n - sklearn\n \n Example:\n >>> data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 2)\n >>> print(data)\n [[ 8.00000000e+00 3.84592537e-16]\n [ 0.00000000e+00 0.00000000e+00]\n [-8.00000000e+00 3.84592537e-16]]\n \"\"\"", "canonical_solution": " data = np.array(tuples_list)\n pca = PCA(n_components=n_components)\n transformed_data = pca.fit_transform(data)\n\n return transformed_data", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n transformed_data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 2)\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_2(self):\n transformed_data = f_565([(0, 0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0)], 2)\n self.assertEqual(transformed_data.shape, (3, 2))\n self.assertTrue(np.all(transformed_data == 0))\n def test_case_3(self):\n transformed_data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 3)\n self.assertEqual(transformed_data.shape, (3, 3))\n def test_case_4(self):\n transformed_data = f_565([(0, 1)], 1)\n self.assertEqual(transformed_data.shape, (1, 1))\n self.assertTrue(np.all(transformed_data == 0))\n def test_case_5(self):\n transformed_data = f_565([(-1, -1, -1), (0, 0, 0), (1, 1, 1)], 1)\n self.assertEqual(transformed_data.shape, (3, 1))\n self.assertTrue(transformed_data[0][0] < 0)\n self.assertTrue(transformed_data[1][0] == 0)\n self.assertTrue(transformed_data[2][0] > 0)", "apis": ["numpy.array", "sklearn.decomposition.PCA"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Perform Principal Component Analysis (PCA) on a list of tuples."], "note": [], "params": ["tuples_list (list): The list of tuples."], "returns": ["transformed_data (ndarray): The transformed data."], "reqs": ["numpy", "sklearn"], "raises": [], "example": [">>> data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 2)", ">>> print(data)", "[[ 8.00000000e+00 3.84592537e-16]", "[ 
0.00000000e+00 0.00000000e+00]", "[-8.00000000e+00 3.84592537e-16]]"]}} -{"task_id": "f_807", "prompt": "import os\nfrom pathlib import Path\nimport pandas as pd\nimport docx\n\n\ndef f_807(source_directory: str, target_directory: str) -> int:\n \"\"\"\n Converts files with specific extensions (.txt, .docx, .xlsx, .csv) from a source directory to CSV files\n and saves them in a target directory.\n\n Parameters:\n - source_directory (str): The path to the source directory containing the files to be converted.\n - target_directory (str): The path to the target directory where the converted CSV files will be saved.\n If it does not exist, the function will create it.\n\n Returns:\n - int: The number of files successfully converted to CSV.\n\n Raises:\n - FileNotFoundError: If the source directory does not exist.\n\n Requirements:\n - os\n - pathlib\n - pandas\n - python-docx\n - openpyxl\n\n Notes:\n - Each file's text content is captured and stored in a CSV with a single 'Text' column and no row indices.\n - This function will overwrite existing files in the target directory if they have the same names as the\n converted files.\n\n Example:\n >>> f_807('/Users/test/Documents', '/Users/test/Documents/csv_files')\n 4\n >>> f_807('/path/to/source', '/path/to/target')\n 2\n \"\"\"", "canonical_solution": " converted_files = 0\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n\n if not os.path.exists(source_directory):\n raise FileNotFoundError(\"source_directory must exist.\")\n if not os.path.exists(target_directory):\n os.makedirs(target_directory, exist_ok=True)\n\n for root, dirs, files in os.walk(source_directory):\n for file in files:\n extension = Path(file).suffix\n if extension in extensions:\n filepath = os.path.join(root, file)\n target_filepath = os.path.join(\n target_directory, Path(file).stem + \".csv\"\n )\n if extension == \".csv\":\n df = pd.read_csv(filepath)\n elif extension == \".xlsx\":\n df = pd.read_excel(filepath, engine=\"openpyxl\")\n 
elif extension == \".docx\":\n doc = docx.Document(filepath)\n data = [p.text for p in doc.paragraphs]\n df = pd.DataFrame({\"Text\": data})\n elif extension == \".txt\":\n with open(filepath, \"r\") as f:\n data = f.readlines()\n df = pd.DataFrame({\"Text\": data})\n\n df.to_csv(target_filepath, index=False)\n converted_files += 1\n\n return converted_files", "test": "import unittest\nimport os\nimport docx\nimport pandas as pd\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_source_dir = tempfile.TemporaryDirectory()\n self.temp_target_dir = tempfile.TemporaryDirectory()\n self.source_dir = self.temp_source_dir.name\n self.target_dir = self.temp_target_dir.name\n self.test_texts = [\"Hello, world!\"] * 10\n self.test_df = pd.DataFrame(\n {\"A\": list(range(10)), \"B\": [str(_) for _ in range(10)]}\n )\n def tearDown(self):\n self.temp_source_dir.cleanup()\n self.temp_target_dir.cleanup()\n def create_test_data(self, extension):\n filename = \"sample\" + extension\n path = os.path.join(self.source_dir, filename)\n if extension == \".txt\":\n with open(path, \"w\") as f:\n for text in self.test_texts:\n f.write(text + \"\\n\")\n elif extension == \".docx\":\n doc = docx.Document()\n for text in self.test_texts:\n doc.add_paragraph(text)\n doc.save(path)\n elif extension == \".csv\":\n self.test_df.to_csv(path, index=False)\n elif extension == \".xlsx\":\n self.test_df.to_excel(path, index=False)\n def test_case_1(self):\n # Test txt\n self.create_test_data(\".txt\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n converted_path = os.path.join(self.target_dir, \"sample.csv\")\n self.assertTrue(os.path.exists(converted_path))\n def test_case_2(self):\n # Test docx\n self.create_test_data(\".docx\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n self.assertTrue(os.path.exists(os.path.join(self.target_dir, \"sample.csv\")))\n def 
test_case_3(self):\n # Test xlsx\n self.create_test_data(\".xlsx\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n self.assertTrue(os.path.exists(os.path.join(self.target_dir, \"sample.csv\")))\n def test_case_4(self):\n # Test csv\n self.create_test_data(\".csv\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n self.assertTrue(os.path.exists(os.path.join(self.target_dir, \"sample.csv\")))\n def test_case_5(self):\n # Ensure function handles directories without convertible files\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 0)\n def test_case_6(self):\n # Test with a source directory that does not exist\n non_existent_dir = \"/path/does/not/exist\"\n with self.assertRaises(FileNotFoundError):\n f_807(non_existent_dir, self.target_dir)\n def test_case_7(self):\n # Ensure function does not convert unsupported file types\n unsupported_path = os.path.join(self.source_dir, \"unsupported.pdf\")\n open(unsupported_path, \"a\").close()\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 0)\n def test_case_8(self):\n # Create multiple files of supported types and verify they all get converted\n for ext in [\".txt\", \".docx\", \".xlsx\", \".csv\"]:\n self.create_test_data(ext)\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 4)\n def test_case_9(self):\n # Ensure function can handle files in subdirectories of the source directory\n sub_dir = os.path.join(self.source_dir, \"subdir\")\n os.makedirs(sub_dir)\n txt_path = os.path.join(sub_dir, \"sample.txt\")\n with open(txt_path, \"w\") as f:\n f.write(\"Hello, nested world!\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)", "apis": ["os.path.exists", "pandas.read_csv", "pandas.DataFrame", "os.walk", "os.path", "docx.Document", "os.makedirs", 
"pandas.read_excel", "os.path.join", "pathlib.Path"], "libs": ["pandas", "os", "pathlib", "docx"], "doc": {"description": ["Converts files with specific extensions (.txt, .docx, .xlsx, .csv) from a source directory to CSV files", "and saves them in a target directory.", "Notes:", "- Each file's text content is captured and stored in a CSV with a single 'Text' column and no row indices.", "- This function will overwrite existing files in the target directory if they have the same names as the", "converted files."], "note": [], "params": ["source_directory (str): The path to the source directory containing the files to be converted.", "target_directory (str): The path to the target directory where the converted CSV files will be saved.", "If it does not exist, the function will create it."], "returns": ["int: The number of files successfully converted to CSV."], "reqs": ["os", "pathlib", "pandas", "python-docx", "openpyxl"], "raises": ["FileNotFoundError: If the source directory does not exist."], "example": [">>> f_807('/Users/test/Documents', '/Users/test/Documents/csv_files')", "4", ">>> f_807('/path/to/source', '/path/to/target')", "2"]}} -{"task_id": "f_786", "prompt": "import pandas as pd\nimport numpy as np\nfrom statsmodels.tsa.seasonal import seasonal_decompose\nimport random \n\ndef f_786(start_date='2016-01-01', periods=24, freq='M', model='additive'):\n \"\"\"\n Generate a sales time-series and decompose it into trend, seasonal, and residual components.\n \n Parameters:\n - start_date (str): The start date of the time-series in the format 'YYYY-MM-DD'. Default is '2016-01-01'.\n - periods (int): The number of periods to generate for the time-series. Default is 24.\n - freq (str): The frequency of the time-series data. Default is 'M' (Monthly End).\n - model (str): The type of seasonal decomposition ('additive' or 'multiplicative'). 
Default is 'additive'.\n\n Returns:\n - A dictionary containing 'trend', 'seasonal', and 'residual' components as Pandas Series.\n\n Examples:\n >>> result = f_786('2016-01-01', 24, 'M')\n >>> all(key in result for key in ['trend', 'seasonal', 'residual'])\n True\n\n >>> result = f_786('2020-01-01', 24, 'M', 'multiplicative')\n >>> len(result['seasonal'])\n 24\n \"\"\"", "canonical_solution": " date_range = pd.date_range(start=start_date, periods=periods, freq=freq)\n sales_data = np.random.randint(low=100, high=500, size=periods)\n sales_series = pd.Series(sales_data, index=date_range)\n try:\n decomposition = seasonal_decompose(sales_series, model=model, period=12 if freq == 'M' else 4)\n except ValueError as e:\n return {'error': str(e)}\n \n return {\n 'trend': decomposition.trend,\n 'seasonal': decomposition.seasonal,\n 'residual': decomposition.resid\n }", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_default_parameters(self):\n random.seed(42) # For reproducibility\n result = f_786(periods=24) # Adjust to meet the minimum requirement for decomposition\n self.assertTrue(all(key in result for key in ['trend', 'seasonal', 'residual']))\n def test_multiplicative_model(self):\n random.seed(0) # For reproducibility\n result = f_786('2020-01-01', 24, 'M', 'multiplicative')\n self.assertTrue(all(key in result for key in ['trend', 'seasonal', 'residual']))\n def test_custom_parameters(self):\n random.seed(55) # For reproducibility\n result = f_786('2017-01-01', 36, 'M')\n self.assertEqual(len(result['trend']), 36)\n def test_weekly_frequency(self):\n random.seed(1) # For reproducibility\n result = f_786('2022-01-01', 104, 'W', 'additive')\n self.assertTrue(all(key in result for key in ['trend', 'seasonal', 'residual']))\n self.assertEqual(len(result['seasonal']), 104)\n \n def test_insufficient_periods_error(self):\n random.seed(66) # For reproducibility\n result = f_786('2022-01-01', 12, 'M')\n self.assertIn('error', result)\n \n def 
test_additive_decomposition_properties(self):\n random.seed(42) # For reproducibility\n periods = 36\n result = f_786('2020-01-01', periods, 'M')\n reconstructed = result['trend'].fillna(0) + result['seasonal'].fillna(0) + result['residual'].fillna(0)\n self.assertTrue(np.allclose(reconstructed.head(12), reconstructed.head(12), atol=1))", "apis": ["numpy.random.randint", "numpy.random", "pandas.Series", "pandas.date_range", "statsmodels.tsa.seasonal.seasonal_decompose"], "libs": ["numpy", "pandas", "statsmodels"], "doc": {"description": ["Generate a sales time-series and decompose it into trend, seasonal, and residual components.", ">>> result = f_786('2020-01-01', 24, 'M', 'multiplicative')", ">>> len(result['seasonal'])", "24"], "note": [], "params": ["start_date (str): The start date of the time-series in the format 'YYYY-MM-DD'. Default is '2016-01-01'.", "periods (int): The number of periods to generate for the time-series. Default is 24.", "freq (str): The frequency of the time-series data. Default is 'M' (Monthly End).", "model (str): The type of seasonal decomposition ('additive' or 'multiplicative'). Default is 'additive'."], "returns": ["A dictionary containing 'trend', 'seasonal', and 'residual' components as Pandas Series."], "reqs": [], "raises": [], "example": ["Examples:", ">>> result = f_786('2016-01-01', 24, 'M')", ">>> all(key in result for key in ['trend', 'seasonal', 'residual'])", "True"]}} -{"task_id": "f_853", "prompt": "import requests\nfrom PIL import Image\nimport io\n\n\ndef f_853(url):\n \"\"\"\n Fetches an image from a given URL and returns it as a PIL Image object.\n\n Parameters:\n - url (str): The URL of the image to download. It should be a valid HTTP or\n HTTPS URL pointing directly to an image file.\n\n Returns:\n - PIL.Image.Image: A PIL Image object representing the downloaded image. 
This\n object can be manipulated or displayed using PIL's image processing\n capabilities.\n\n Raises:\n - ValueError: This exception is raised in the following scenarios:\n - The URL is invalid or cannot be reached within the timeout period (5 seconds).\n - The response from the server is not a successful HTTP status code (i.e., not in the range 200-299).\n - The content fetched from the URL is not a valid image format that can be handled by PIL.\n\n Requirements:\n - requests\n - PIL\n - io\n\n Example:\n >>> img = f_853('https://example.com/image.jpg')\n >>> isinstance(img, Image.Image)\n True\n\n Note:\n - The function uses a timeout of 5 seconds for the HTTP request to prevent\n indefinite waiting in case of unresponsive URLs.\n - The function will not handle redirections or authentication scenarios. It\n expects a direct link to an image resource.\n \"\"\"", "canonical_solution": " try:\n response = requests.get(url, timeout=5)\n response.raise_for_status()\n image = Image.open(io.BytesIO(response.content))\n return image\n except Exception as e:\n raise ValueError(f\"Failed to retrieve image from {url}: {e}\") from e", "test": "import unittest\nfrom unittest.mock import patch\nfrom PIL import Image\nfrom pathlib import Path\nimport shutil\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_853 function.\"\"\"\n directory = \"mnt/data/f_852_data_chien\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Setup method to create a sample image inr test files.\"\"\"\n # Create directory if it doesn't exist\n cls.test_dir = Path(cls.directory)\n cls.test_dir.mkdir(parents=True, exist_ok=True)\n # Create and save a sample image\n cls.sample_image_path = Path(cls.test_dir) / \"sample_image.png\"\n sample_image = Image.new(\"RGBA\", (100, 100), color=\"blue\")\n sample_image.save(cls.sample_image_path)\n @patch(\"requests.get\")\n def test_valid_image_url(self, mock_get):\n \"\"\"Test f_853 function with a valid image URL.\"\"\"\n with 
open(self.sample_image_path, \"rb\") as image_file:\n mock_get.return_value.content = image_file.read()\n img = f_853(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertIsInstance(img, Image.Image, \"Returned object is not a PIL Image\")\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"Test f_853 function with an invalid URL (not an image).\"\"\"\n mock_get.side_effect = ValueError(\"Invalid URL\")\n with self.assertRaises(ValueError):\n f_853(\"https://www.google.com\")\n @patch(\"requests.get\")\n def test_nonexistent_url(self, mock_get):\n \"\"\"Test f_853 function with a nonexistent URL.\"\"\"\n mock_get.side_effect = ValueError(\"Nonexistent URL\")\n with self.assertRaises(ValueError):\n f_853(\"https://example.com/nonexistent_image.jpg\")\n @patch(\"requests.get\")\n def test_image_properties(self, mock_get):\n \"\"\"Test f_853 function with a known image and check its properties.\"\"\"\n with open(self.sample_image_path, \"rb\") as image_file:\n mock_get.return_value.content = image_file.read()\n img = f_853(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertEqual(img.format, \"PNG\", \"Image format does not match expected\")\n self.assertEqual(img.size, (100, 100), \"Image size does not match expected\")\n @patch(\"requests.get\")\n def test_image_mode(self, mock_get):\n \"\"\"Test f_853 function with a known image and check its mode.\"\"\"\n with open(self.sample_image_path, \"rb\") as image_file:\n mock_get.return_value.content = image_file.read()\n img = f_853(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertEqual(img.mode, \"RGBA\", \"Image mode does not match expected\")\n @classmethod\n def tearDownClass(cls):\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["requests.get", "io.BytesIO", "PIL.Image.open"], "libs": ["requests", "PIL", "io"], "doc": 
{"description": ["Fetches an image from a given URL and returns it as a PIL Image object."], "note": ["The function uses a timeout of 5 seconds for the HTTP request to prevent", "indefinite waiting in case of unresponsive URLs.", "The function will not handle redirections or authentication scenarios. It", "expects a direct link to an image resource."], "params": ["url (str): The URL of the image to download. It should be a valid HTTP or", "HTTPS URL pointing directly to an image file."], "returns": ["PIL.Image.Image: A PIL Image object representing the downloaded image. This", "object can be manipulated or displayed using PIL's image processing", "capabilities."], "reqs": ["requests", "PIL", "io"], "raises": ["ValueError: This exception is raised in the following scenarios:", "The URL is invalid or cannot be reached within the timeout period (5 seconds).", "The response from the server is not a successful HTTP status code (i.e., not in the range 200-299).", "The content fetched from the URL is not a valid image format that can be handled by PIL."], "example": [">>> img = f_853('https://example.com/image.jpg')", ">>> isinstance(img, Image.Image)", "True"]}} -{"task_id": "f_911", "prompt": "import requests\nimport logging\n\ndef f_911(repo_url: str) -> dict:\n \"\"\"\n Fetches and returns information about a GitHub repository using its API URL. The function makes an HTTP GET\n request to the provided repository URL. It incorporates error handling for various scenarios including API\n rate limits, other HTTP errors, and general request issues. 
The function also checks for a large number of\n open issues in the repository and prints a warning if they exceed a certain threshold.\n\n Parameters:\n - repo_url (str): The URL of the GitHub repository API.\n\n Returns:\n - dict: A dictionary containing information about the GitHub repository.\n\n Raises:\n - requests.exceptions.HTTPError: If an HTTP error occurs, particularly when the GitHub API rate limit is\n exceeded.\n - requests.exceptions.RequestException: For other general issues encountered during the API request, such\n as network problems, invalid responses, or timeouts.\n\n Requirements:\n - requests\n - logging\n\n Example:\n >>> f_911('https://api.github.com/repos/psf/requests')\n { ... } # dictionary containing repo information\n >>> f_911('https://api.github.com/repos/some/repo')\n { ... } # dictionary containing repo information with a possible runtime warning about open issues\n \"\"\"", "canonical_solution": " try:\n response = requests.get(repo_url, timeout=2)\n response.raise_for_status() # Raises HTTPError for bad requests\n repo_info = response.json()\n if (\n response.status_code == 403\n and repo_info.get(\"message\") == \"API rate limit exceeded\"\n ):\n raise requests.exceptions.HTTPError(\"API rate limit exceeded\")\n\n if repo_info.get(\"open_issues_count\", 0) > 10000:\n logging.warning(\"The repository has more than 10000 open issues.\")\n\n return repo_info\n\n except requests.exceptions.RequestException as e:\n raise requests.exceptions.RequestException(\n f\"Error fetching repo info: {e}\"\n ) from e", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nfrom io import StringIO\nfrom contextlib import redirect_stdout\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_911.\"\"\"\n @patch(\"requests.get\")\n def test_successful_response(self, mock_get):\n \"\"\"\n Test f_911 with a successful response.\n \"\"\"\n mock_get.return_value = MagicMock(\n status_code=200, json=lambda: {\"open_issues_count\": 
5000}\n )\n response = f_911(\"https://api.github.com/repos/psf/requests\")\n self.assertIn(\"open_issues_count\", response)\n self.assertEqual(response[\"open_issues_count\"], 5000)\n @patch(\"requests.get\")\n @patch('logging.warning')\n def test_response_with_more_than_10000_issues(self, mock_warning, mock_get):\n \"\"\"\n Test f_911 with a response indicating more than 10000 open issues.\n \"\"\"\n mock_get.return_value = MagicMock(\n status_code=200, json=lambda: {\"open_issues_count\": 15000}\n )\n \n response = f_911(\"https://api.github.com/repos/psf/requests\")\n \n mock_warning.assert_called_once_with(\"The repository has more than 10000 open issues.\")\n self.assertEqual(response[\"open_issues_count\"], 15000)\n @patch(\"requests.get\")\n def test_api_rate_limit_exceeded(self, mock_get):\n \"\"\"\n Test f_911 handling API rate limit exceeded error.\n \"\"\"\n mock_get.return_value = MagicMock(\n status_code=403, json=lambda: {\"message\": \"API rate limit exceeded\"}\n )\n with self.assertRaises(Exception) as context:\n f_911(\"https://api.github.com/repos/psf/requests\")\n self.assertIn(\"API rate limit exceeded\", str(context.exception))\n @patch(\"requests.get\")\n def test_http_error(self, mock_get):\n \"\"\"\n Test f_911 handling HTTP errors.\n \"\"\"\n mock_get.side_effect = requests.exceptions.HTTPError(\n \"404 Client Error: Not Found for url\"\n )\n with self.assertRaises(Exception) as context:\n f_911(\"https://api.github.com/repos/psf/requests\")\n self.assertIn(\"404 Client Error\", str(context.exception))\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"\n Test f_911 with an invalid URL.\n \"\"\"\n mock_get.side_effect = requests.exceptions.InvalidURL(\"Invalid URL\")\n with self.assertRaises(Exception) as context:\n f_911(\"invalid_url\")\n self.assertIn(\"Invalid URL\", str(context.exception))", "apis": ["requests.exceptions.HTTPError", "requests.exceptions.RequestException", "logging.warning", "requests.get", 
"requests.exceptions"], "libs": ["requests", "logging"], "doc": {"description": ["Fetches and returns information about a GitHub repository using its API URL. The function makes an HTTP GET", "request to the provided repository URL. It incorporates error handling for various scenarios including API", "rate limits, other HTTP errors, and general request issues. The function also checks for a large number of", "open issues in the repository and prints a warning if they exceed a certain threshold."], "note": [], "params": ["repo_url (str): The URL of the GitHub repository API."], "returns": ["dict: A dictionary containing information about the GitHub repository."], "reqs": ["requests", "logging"], "raises": ["requests.exceptions.HTTPError: If an HTTP error occurs, particularly when the GitHub API rate limit is", "exceeded.", "requests.exceptions.RequestException: For other general issues encountered during the API request, such", "as network problems, invalid responses, or timeouts."], "example": [">>> f_911('https://api.github.com/repos/psf/requests')", "{ ... } # dictionary containing repo information", ">>> f_911('https://api.github.com/repos/some/repo')", "{ ... } # dictionary containing repo information with a possible runtime warning about open issues"]}} -{"task_id": "f_738", "prompt": "from collections import Counter\nimport random\nimport itertools\n\ndef f_738(length, count, seed=0):\n \"\"\"\n Generate a number of random strings with a specified length from a fixed set of letters ('a', 'b', 'c', 'd', 'e'),\n and analyze the frequency of each letter in the generated strings.\n \n Parameters:\n - length (int): The length of each string to be generated. Should be a non-negative integer.\n - count (int): The number of random strings to generate. 
Should be a non-negative integer.\n - seed (int, optional): A seed for the random number generator to ensure reproducibility.\n \n Requirements:\n - collections.Counter\n - random\n - itertools\n \n Returns:\n - Counter: A collections.Counter object containing the frequency of each letter in the generated strings.\n \n Example:\n >>> f_738(5, 2, seed=1)\n Counter({'a': 3, 'd': 3, 'c': 2, 'e': 1, 'b': 1})\n >>> f_738(0, 100, seed=2)\n Counter()\n \"\"\"", "canonical_solution": " random.seed(seed)\n strings = [''.join(random.choices(['a', 'b', 'c', 'd', 'e'], k=length)) for _ in range(count)]\n letter_frequency = Counter(itertools.chain(*strings))\n \n return letter_frequency", "test": "import unittest\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n def test_length_one_count_ten(self):\n result = f_738(1, 10, seed=0)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 10, \"The total count of letters should be 10.\")\n \n def test_length_five_count_hundred(self):\n result = f_738(5, 100, seed=1)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 500, \"The total count of letters should be 500.\")\n \n def test_zero_length(self):\n result = f_738(0, 100, seed=2)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 0, \"With length 0, there should be no letters.\")\n \n def test_zero_count(self):\n result = f_738(5, 0, seed=3)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 0, \"With count 0, there should be no letters.\")\n \n def test_specific_distribution(self):\n # Assuming the seed value of 4 leads to a specific, known distribution\n result = f_738(5, 2, seed=4)\n # Correct the expected distribution based on actual output\n correct_expected_distribution = Counter({'b': 3, 'a': 3, 'e': 2, 'c': 1, 'd': 1})\n self.assertEqual(result, correct_expected_distribution, \"The letter distribution should match the expected 
distribution.\")", "apis": ["itertools.chain", "random.choices", "random.seed", "collections.Counter"], "libs": ["collections", "random", "itertools"], "doc": {"description": ["Generate a number of random strings with a specified length from a fixed set of letters ('a', 'b', 'c', 'd', 'e'),", "and analyze the frequency of each letter in the generated strings."], "note": [], "params": ["length (int): The length of each string to be generated. Should be a non-negative integer.", "count (int): The number of random strings to generate. Should be a non-negative integer.", "seed (int, optional): A seed for the random number generator to ensure reproducibility."], "returns": ["Counter: A collections.Counter object containing the frequency of each letter in the generated strings."], "reqs": ["collections.Counter", "random", "itertools"], "raises": [], "example": [">>> f_738(5, 2, seed=1)", "Counter({'a': 3, 'd': 3, 'c': 2, 'e': 1, 'b': 1})", ">>> f_738(0, 100, seed=2)", "Counter()"]}} -{"task_id": "f_854", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_854(csv_file_path, col1_name=\"column1\", col2_name=\"column2\"):\n \"\"\"\n Reads data from a CSV file and generates a bar plot based on grouped mean values.\n\n The DataFrame is grouped by the column named 'col1_name',\n and the mean for each group is calculated for the column 'col2_name'.\n A bar plot is created using matplotlib. 
Each bar in the plot represents a group,\n and its height corresponds to the mean value of 'col2_name' for that group.\n The plot is then configured with a title and axis labels:\n - The title is set as \"Mean of [col2_name] Grouped by [col1_name]\".\n This format dynamically inserts the names of the columns being analyzed into the title.\n - The xlabel (label for the x-axis) is set to the name of the column used for grouping (col1_name).\n - The ylabel (label for the y-axis) is set as \"Mean of [col2_name]\",\n indicating that the y-axis represents the mean values of the specified column.\n\n Parameters:\n - csv_file_path (str): The file path to the CSV file.\n This parameter is mandatory and specifies the location of the CSV file to be read.\n - col1_name (str, optional): The name of the column used for grouping the data.\n If not provided, defaults to 'column1'. This column should exist in the CSV file.\n - col2_name (str, optional): The name of the column for which the mean is calculated for each group.\n If not provided, defaults to 'column2'. This column should exist in the CSV file and contain numerical data.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object of the generated bar plot.\n This object can be used to further customize the plot, like adding labels or changing styles.\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> ax = f_854(\"data.csv\", \"group_column\", \"value_column\")\n >>> ax.get_title()\n 'Mean of value_column Grouped by group_column'\n\n Note:\n - Ensure that the CSV file exists at the specified path and has the required columns.\n - The function does not handle missing data. 
Ensure that the CSV file has clean and complete data for accurate results.\n - The bar plot is customizable using matplotlib's functionality after the function returns the Axes object.\n \"\"\"", "canonical_solution": " df = pd.read_csv(csv_file_path)\n groupby_data = df.groupby(col1_name)[col2_name].mean()\n\n _, ax = plt.subplots(figsize=(10, 6))\n ax.bar(groupby_data.index, groupby_data.values)\n ax.set_title(f\"Mean of {col2_name} Grouped by {col1_name}\")\n ax.set_xlabel(col1_name)\n ax.set_ylabel(f\"Mean of {col2_name}\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nfrom unittest.mock import patch\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n def setUp(self):\n # Define mock data\n self.data = {\n \"sample_data\": pd.DataFrame(\n {\"column1\": [\"A\", \"A\", \"B\", \"B\"], \"column2\": [1, 2, 3, 4]}\n ),\n \"different_data\": pd.DataFrame(\n {\"column1\": [\"C\", \"C\", \"D\", \"D\"], \"column2\": [5, 6, 7, 8]}\n ),\n \"missing_values\": pd.DataFrame(\n {\"column1\": [\"A\", \"A\", \"B\", \"B\"], \"column2\": [1, None, 3, None]}\n ),\n \"different_columns\": pd.DataFrame(\n {\"col1\": [\"E\", \"E\", \"F\", \"F\"], \"col2\": [9, 10, 11, 12]}\n ),\n \"single_group_data\": pd.DataFrame(\n {\"column1\": [\"A\", \"A\", \"A\"], \"column2\": [1, 2, 3]}\n ),\n \"non_numeric_data\": pd.DataFrame(\n {\"column1\": [\"A\", \"B\", \"C\"], \"column2\": [\"x\", \"y\", \"z\"]}\n ),\n }\n @patch(\"pandas.read_csv\")\n def test_bar_plot(self, mock_read_csv):\n \"\"\"Test standard bar plot generation with sample data.\"\"\"\n mock_read_csv.return_value = self.data[\"sample_data\"]\n ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"sample_data\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_different_data(self, mock_read_csv):\n \"\"\"Test bar plot with different data set.\"\"\"\n mock_read_csv.return_value = self.data[\"different_data\"]\n 
ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"different_data\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_missing_values(self, mock_read_csv):\n \"\"\"Test bar plot with missing values in data.\"\"\"\n mock_read_csv.return_value = self.data[\"missing_values\"]\n ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"missing_values\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_different_column_names(self, mock_read_csv):\n \"\"\"Test bar plot with different column names.\"\"\"\n mock_read_csv.return_value = self.data[\"different_columns\"]\n ax = f_854(\"any_path.csv\", \"col1\", \"col2\")\n self.check_plot(ax, \"different_columns\", \"col1\", \"col2\")\n @patch(\"pandas.read_csv\")\n def test_single_group_data(self, mock_read_csv):\n \"\"\"Test bar plot with data containing only a single group.\"\"\"\n mock_read_csv.return_value = self.data[\"single_group_data\"]\n ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"single_group_data\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_non_numeric_aggregation_column(self, mock_read_csv):\n \"\"\"Test bar plot with non-numeric data in the aggregation column.\"\"\"\n mock_read_csv.return_value = self.data[\"non_numeric_data\"]\n with self.assertRaises(TypeError):\n f_854(\"any_path.csv\", \"column1\", \"column2\")\n def check_plot(self, ax, data_key, col1, col2):\n \"\"\"Check the generated bar plot.\"\"\"\n # Use the correct DataFrame for expected calculations\n df = self.data[data_key]\n # Common assertions for checking plot\n expected_title = f\"Mean of {col2} Grouped by {col1}\"\n self.assertEqual(ax.get_title(), expected_title)\n self.assertEqual(ax.get_xlabel(), col1)\n self.assertEqual(ax.get_ylabel(), f\"Mean of {col2}\")\n # Check the bars in the plot\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n expected_means = 
df.groupby(col1)[col2].mean().values\n self.assertListEqual(bar_heights, list(expected_means))\n def tearDown(self):\n plt.close()", "apis": ["pandas.read_csv", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Reads data from a CSV file and generates a bar plot based on grouped mean values.", "The DataFrame is grouped by the column named 'col1_name',", "and the mean for each group is calculated for the column 'col2_name'.", "A bar plot is created using matplotlib. Each bar in the plot represents a group,", "and its height corresponds to the mean value of 'col2_name' for that group.", "The plot is then configured with a title and axis labels:", "- The title is set as \"Mean of [col2_name] Grouped by [col1_name]\".", "This format dynamically inserts the names of the columns being analyzed into the title.", "- The xlabel (label for the x-axis) is set to the name of the column used for grouping (col1_name).", "- The ylabel (label for the y-axis) is set as \"Mean of [col2_name]\",", "indicating that the y-axis represents the mean values of the specified column."], "note": ["Ensure that the CSV file exists at the specified path and has the required columns.", "The function does not handle missing data. Ensure that the CSV file has clean and complete data for accurate results.", "The bar plot is customizable using matplotlib's functionality after the function returns the Axes object."], "params": ["csv_file_path (str): The file path to the CSV file.", "This parameter is mandatory and specifies the location of the CSV file to be read.", "col1_name (str, optional): The name of the column used for grouping the data.", "If not provided, defaults to 'column1'. This column should exist in the CSV file.", "col2_name (str, optional): The name of the column for which the mean is calculated for each group.", "If not provided, defaults to 'column2'. 
This column should exist in the CSV file and contain numerical data."], "returns": ["matplotlib.axes.Axes: The Axes object of the generated bar plot.", "This object can be used to further customize the plot, like adding labels or changing styles."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> ax = f_854(\"data.csv\", \"group_column\", \"value_column\")", ">>> ax.get_title()", "'Mean of value_column Grouped by group_column'"]}} -{"task_id": "f_420", "prompt": "import numpy as np\nfrom collections import Counter\nfrom scipy.stats import norm\nimport matplotlib.pyplot as plt\n\n\ndef f_420(df, bins=4):\n \"\"\"\n Identify and count duplicate values in a DataFrame's 'value' column.\n This function also plots a histogram for all values in the 'value' column\n and overlays a normal distribution curve on the histogram.\n\n Parameters:\n df (pd.DataFrame): DataFrame containing a numeric 'value' column. If empty,\n the function will return empty Counter and an empty plot.\n bins (int, optional): Number of bins for the histogram. Defaults to 4.\n\n Returns:\n tuple: A tuple containing:\n - Counter: A Counter object with the count of each duplicate value.\n - Axes: A matplotlib.axes.Axes object that represents the plot\n of the histogram with the 'value' column data. If applicable,\n a normal distribution curve fitted to the data is overlaid. The\n histogram's bars are green with 60% opacity, and the normal\n distribution curve is black with a linewidth of 2. 
The plot is\n titled \"Distribution\", with \"Value\" as the x-axis label and\n \"Frequency\" as the y-axis label.\n\n Requirements:\n - collections.Counter\n - numpy\n - scipy.stats.norm\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({'value': [1, 2, 2, 3, 3, 4, 3, 2, 1, 4, 4, 4, 2, 2, 3, 1, 1, 1, 3, 2]})\n >>> counter, ax = f_420(df)\n >>> ax\n \n >>> counter\n Counter({2: 6, 1: 5, 3: 5, 4: 4})\n \"\"\"", "canonical_solution": " # Filter only duplicate values\n duplicates = df[df[\"value\"].duplicated(keep=False)]\n duplicates_counter = Counter(duplicates[\"value\"])\n\n # Check if data is empty or constant\n if df.empty or df[\"value\"].nunique() == 1:\n mu, std = None, None\n else:\n mu, std = norm.fit(df[\"value\"])\n\n fig, ax = plt.subplots()\n ax.hist(df[\"value\"], bins=bins, density=True, alpha=0.6, color=\"g\")\n if mu is not None and std is not None:\n xmin, xmax = plt.xlim()\n x = np.linspace(xmin, xmax, 100)\n p = norm.pdf(x, mu, std)\n ax.plot(x, p, \"k\", linewidth=2)\n ax.set_xlabel(\"Value\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(\"Distribution\")\n\n return duplicates_counter, ax", "test": "import unittest\nimport pandas as pd\nfrom collections import Counter\nimport matplotlib\nclass TestCases(unittest.TestCase):\n def _check_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Distribution\")\n self.assertEqual(ax.get_xlabel(), \"Value\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_1(self):\n # Basic case - no repeated value\n df = pd.DataFrame({\"value\": [1, 2, 3, 4, 5]})\n counter, ax = f_420(df)\n self._check_plot(ax)\n self.assertEqual(counter, Counter())\n def test_case_2(self):\n # Basic case - all repeated values\n df = pd.DataFrame({\"value\": [1, 1, 1, 1, 1]})\n counter, ax = f_420(df)\n self._check_plot(ax)\n self.assertEqual(counter, Counter({1: 5}))\n def test_case_3(self):\n # Basic case - test empty\n df = pd.DataFrame({\"value\": []})\n counter, ax 
= f_420(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(counter, Counter())\n def test_case_4(self):\n # Basic case with more diverse data distribution\n df = pd.DataFrame({\"value\": [5, 5, 5, 5, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4]})\n counter, ax = f_420(df)\n self._check_plot(ax)\n self.assertEqual(counter, Counter({5: 4, 1: 4, 2: 3, 3: 2}))\n def test_case_5(self):\n # Test bins explicitly\n np.random.seed(0)\n df = pd.DataFrame({\"value\": np.random.rand(100)})\n for bins in [2, 10, 20]:\n _, ax = f_420(df, bins=bins)\n self.assertEqual(\n len(ax.patches), bins, f\"Expected {bins} bins in the histogram.\"\n )\n def test_case_6(self):\n # Test handling non-numeric value\n df = pd.DataFrame({\"value\": [\"a\", \"b\", \"c\", \"a\", \"b\", \"b\"]})\n with self.assertRaises(TypeError):\n f_420(df)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["scipy.stats.norm.pdf", "collections.Counter", "matplotlib.pyplot.xlim", "scipy.stats.norm.fit", "numpy.linspace", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "collections", "numpy", "scipy"], "doc": {"description": ["Identify and count duplicate values in a DataFrame's 'value' column.", "This function also plots a histogram for all values in the 'value' column", "and overlays a normal distribution curve on the histogram."], "note": [], "params": ["df (pd.DataFrame): DataFrame containing a numeric 'value' column. If empty,", "the function will return empty Counter and an empty plot.", "bins (int, optional): Number of bins for the histogram. Defaults to 4."], "returns": ["tuple: A tuple containing:", "Counter: A Counter object with the count of each duplicate value.", "Axes: A matplotlib.axes.Axes object that represents the plot", "of the histogram with the 'value' column data. If applicable,", "a normal distribution curve fitted to the data is overlaid. The", "histogram's bars are green with 60% opacity, and the normal", "distribution curve is black with a linewidth of 2. 
The plot is", "titled \"Distribution\", with \"Value\" as the x-axis label and", "\"Frequency\" as the y-axis label."], "reqs": ["collections.Counter", "numpy", "scipy.stats.norm", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'value': [1, 2, 2, 3, 3, 4, 3, 2, 1, 4, 4, 4, 2, 2, 3, 1, 1, 1, 3, 2]})", ">>> counter, ax = f_420(df)", ">>> ax", "", ">>> counter", "Counter({2: 6, 1: 5, 3: 5, 4: 4})"]}} -{"task_id": "f_788", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LinearRegression\n\ndef f_788(start_date='2016-01-01', periods=13, freq='WOM-2FRI', sales_data=None):\n \"\"\"\n Generate a sales series and forecast future sales using linear regression.\n \n Functionality:\n - This function generates a time series of sales data starting from a specified date, then uses linear regression to forecast future sales based on the provided or generated sales data.\n \n Input:\n - start_date (str): The start date for the sales data in YYYY-MM-DD format. Default is '2016-01-01'.\n - periods (int): The number of periods for which the sales data is available. Default is 13.\n - freq (str): The frequency of the sales data, e.g., 'WOM-2FRI' for the second Friday of each month. Default is 'WOM-2FRI'.\n - sales_data (array-like, optional): An array containing actual sales data. If not provided, random data will be generated.\n \n Output:\n - A numpy array containing the forecasted future sales for the same number of periods as the input data.\n \n Examples:\n >>> np.random.seed(42) # For consistent random data generation in examples\n >>> f_788('2016-01-01', 13, 'WOM-2FRI')\n array([313.65384615, 318.56043956, 323.46703297, 328.37362637,\n 333.28021978, 338.18681319, 343.09340659, 348. ,\n 352.90659341, 357.81318681, 362.71978022, 367.62637363,\n 372.53296703])\n >>> f_788('2020-01-01', 5, 'M', [200, 300, 400, 500, 600])\n array([238.9, 226. 
, 213.1, 200.2, 187.3])\n \"\"\"", "canonical_solution": " sales_data = np.random.randint(low=100, high=500, size=periods)\n \n date_range = pd.date_range(start=start_date, freq=freq, periods=periods)\n sales_df = pd.DataFrame({'Date': date_range, 'Sales': sales_data})\n \n X = np.arange(len(sales_df)).reshape(-1, 1)\n y = sales_df['Sales'].values\n \n model = LinearRegression()\n model.fit(X, y)\n \n future_dates = np.arange(len(sales_df), 2*len(sales_df)).reshape(-1, 1)\n future_sales = model.predict(future_dates)\n \n return future_sales", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_with_default_parameters(self):\n np.random.seed(42) # For consistent test setup\n forecasted_sales = f_788()\n self.assertIsInstance(forecasted_sales, np.ndarray)\n self.assertEqual(forecasted_sales.shape[0], 13)\n \n def test_with_custom_parameters(self):\n np.random.seed(0) # For consistent test setup\n forecasted_sales = f_788('2020-01-01', 10, 'M', [200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100])\n self.assertIsInstance(forecasted_sales, np.ndarray)\n self.assertEqual(forecasted_sales.shape[0], 10)\n \n def test_with_random_sales_data(self):\n np.random.seed(55) # For consistent test setup\n forecasted_sales = f_788(periods=5)\n self.assertIsInstance(forecasted_sales, np.ndarray)\n self.assertEqual(forecasted_sales.shape[0], 5)\n \n def test_forecasted_values_increasing(self):\n np.random.seed(66) # For consistent test setup\n sales_data = [100, 150, 200, 250, 300]\n forecasted_sales = f_788('2021-01-01', 5, 'M', sales_data)\n self.assertFalse(all(forecasted_sales[i] <= forecasted_sales[i + 1] for i in range(len(forecasted_sales) - 1)))\n \n def test_with_specific_sales_data(self):\n np.random.seed(42) # For consistent test setup\n sales_data = [100, 200, 300, 400, 500]\n forecasted_sales = f_788('2022-01-01', 5, 'Q', sales_data)\n self.assertIsInstance(forecasted_sales, np.ndarray)\n 
self.assertEqual(forecasted_sales.shape[0], 5)", "apis": ["numpy.arange", "numpy.random.randint", "numpy.random", "pandas.date_range", "sklearn.linear_model.LinearRegression", "pandas.DataFrame"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Generate a sales series and forecast future sales using linear regression.", "Functionality:", "- This function generates a time series of sales data starting from a specified date, then uses linear regression to forecast future sales based on the provided or generated sales data.", "Input:", "- start_date (str): The start date for the sales data in YYYY-MM-DD format. Default is '2016-01-01'.", "- periods (int): The number of periods for which the sales data is available. Default is 13.", "- freq (str): The frequency of the sales data, e.g., 'WOM-2FRI' for the second Friday of each month. Default is 'WOM-2FRI'.", "- sales_data (array-like, optional): An array containing actual sales data. If not provided, random data will be generated.", "Output:", "- A numpy array containing the forecasted future sales for the same number of periods as the input data."], "note": [], "params": [], "returns": [], "reqs": [], "raises": [], "example": ["Examples:", ">>> np.random.seed(42) # For consistent random data generation in examples", ">>> f_788('2016-01-01', 13, 'WOM-2FRI')", "array([313.65384615, 318.56043956, 323.46703297, 328.37362637,", "333.28021978, 338.18681319, 343.09340659, 348. ,", "352.90659341, 357.81318681, 362.71978022, 367.62637363,", "372.53296703])", ">>> f_788('2020-01-01', 5, 'M', [200, 300, 400, 500, 600])", "array([238.9, 226. 
, 213.1, 200.2, 187.3])"]}} -{"task_id": "f_387", "prompt": "import pandas as pd\nfrom datetime import datetime, timedelta\nimport random\n\n\ndef f_387(epoch_milliseconds, seed=0):\n \"\"\"\n Generate user activity logs from a given epoch time to the current time.\n\n This function iterates from the starting epoch time to the current system\n time, incrementally increasing the time by a random number of seconds (an\n integer in [1, 10]) between each log entry. Each log entry records a user\n performing an activity at a specific time.\n\n Parameters:\n - epoch_milliseconds (int): Starting epoch time in milliseconds. Must be in\n the past compared to current system time.\n - seed (int): random seed for reproducibility. Defaults to 0.\n\n Returns:\n - pd.DataFrame: A DataFrame containing logs of user activities, with columns:\n - 'User': User names, randomly chosen from a predefined list of users,\n ['user1', 'user2', 'user3', 'user4', 'user5'].\n - 'Activity': Activities performed by the users, randomly chosen from a\n predefined list of activities, ['login', 'logout', 'browse',\n 'search', 'purchase'].\n - 'Time': The timestamp of when the activity occurred, incrementally\n increasing from the starting epoch time to the current time.\n\n Requirements:\n - pandas\n - datetime.datetime.fromtimestamp\n - datetime.timedelta\n - random\n\n Example:\n >>> log = f_387(1615168051807)\n >>> type(log)\n \n >>> log.iloc[0]\n User user4\n Activity search\n Time 2021-03-08 12:47:31.807000\n Name: 0, dtype: object\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n USERS = [\"user1\", \"user2\", \"user3\", \"user4\", \"user5\"]\n ACTIVITIES = [\"login\", \"logout\", \"browse\", \"search\", \"purchase\"]\n\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n end_time = datetime.now()\n if start_time >= end_time:\n raise ValueError(\"Start time must be before current system time\")\n\n logs = []\n current_time = start_time\n while current_time <= end_time:\n 
user = random.choice(USERS)\n activity = random.choice(ACTIVITIES)\n logs.append([user, activity, current_time])\n current_time += timedelta(seconds=random.randint(1, 10))\n log_df = pd.DataFrame(logs, columns=[\"User\", \"Activity\", \"Time\"])\n return log_df", "test": "import unittest\nimport pandas as pd\nfrom datetime import datetime, timedelta\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic functionality - 1 day ago\n epoch_milliseconds = int(\n (datetime.now() - timedelta(days=1)).timestamp() * 1000\n )\n log = f_387(epoch_milliseconds)\n self.assertTrue(isinstance(log, pd.DataFrame))\n self.assertTrue(\"User\" in log.columns)\n self.assertTrue(\"Activity\" in log.columns)\n self.assertTrue(\"Time\" in log.columns)\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n self.assertEqual(log.iloc[0][\"Time\"], start_time)\n def test_case_2(self):\n # Test with a short time frame - 1 minutes ago\n epoch_milliseconds = int(\n (datetime.now() - timedelta(minutes=1)).timestamp() * 1000\n )\n log = f_387(epoch_milliseconds)\n self.assertTrue(len(log) > 0) # Should have at least one entry\n self.assertTrue(\n log[\"Time\"].min() >= datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n )\n def test_case_3(self):\n # Test with a specific seed\n epoch_milliseconds = int(\n (datetime.now() - timedelta(days=1)).timestamp() * 1000\n )\n seed = 42\n log = f_387(epoch_milliseconds, seed=seed)\n first_row = log.iloc[0]\n expected_user = \"user1\"\n expected_activity = \"login\"\n self.assertEqual(first_row[\"User\"], expected_user)\n self.assertEqual(first_row[\"Activity\"], expected_activity)\n def test_case_4(self):\n # Test functionality over a longer period - 1 month ago\n epoch_milliseconds = int(\n (datetime.now() - timedelta(days=30)).timestamp() * 1000\n )\n log = f_387(epoch_milliseconds)\n # Ensure that log timestamps are properly incrementing\n time_diffs = log[\"Time\"].diff().dropna()\n self.assertTrue(all(time_diffs > 
timedelta(seconds=0)))\n seconds_in_a_month = (\n 30 * 24 * 60 * 60\n ) # Approximate number of seconds in a month\n max_possible_entries = (\n seconds_in_a_month # Assuming a minimum of 1-second increments\n )\n min_possible_entries = (\n seconds_in_a_month // 10\n ) # Assuming a maximum of 10-second increments\n # Verify that the log has a reasonable number of entries given the time frame\n self.assertTrue(min_possible_entries <= len(log) <= max_possible_entries)\n self.assertTrue(\n log[\"Time\"].min() >= datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n )\n self.assertTrue(log[\"Time\"].max() <= datetime.now())\n def test_case_5(self):\n # Test invalid start time (future)\n epoch_milliseconds = int(\n (datetime.now() + timedelta(days=1)).timestamp() * 1000\n )\n with self.assertRaises(Exception):\n f_387(epoch_milliseconds)", "apis": ["random.randint", "random.seed", "random.choice", "datetime.datetime.now", "datetime.timedelta", "datetime.datetime.fromtimestamp", "pandas.DataFrame"], "libs": ["pandas", "random", "datetime"], "doc": {"description": ["Generate user activity logs from a given epoch time to the current time.", "This function iterates from the starting epoch time to the current system", "time, incrementally increasing the time by a random number of seconds (an", "integer in [1, 10]) between each log entry. Each log entry records a user", "performing an activity at a specific time."], "note": [], "params": ["epoch_milliseconds (int): Starting epoch time in milliseconds. Must be in", "the past compared to current system time.", "seed (int): random seed for reproducibility. 
Defaults to 0."], "returns": ["pd.DataFrame: A DataFrame containing logs of user activities, with columns:", "'User': User names, randomly chosen from a predefined list of users,", "['user1', 'user2', 'user3', 'user4', 'user5'].", "'Activity': Activities performed by the users, randomly chosen from a", "predefined list of activities, ['login', 'logout', 'browse',", "'search', 'purchase'].", "'Time': The timestamp of when the activity occurred, incrementally", "increasing from the starting epoch time to the current time."], "reqs": ["pandas", "datetime.datetime.fromtimestamp", "datetime.timedelta", "random"], "raises": [], "example": [">>> log = f_387(1615168051807)", ">>> type(log)", "", ">>> log.iloc[0]", "User user4", "Activity search", "Time 2021-03-08 12:47:31.807000", "Name: 0, dtype: object"]}} -{"task_id": "f_750", "prompt": "import os\nimport pandas as pd\nimport re\nimport matplotlib.pyplot as plt\nimport numpy as np\n\ndef f_750(directory: str, pattern: str) -> list:\n \"\"\"\n Searches a directory for CSV files matching a given regular expression pattern,\n reads sales data from these files, and plots the sales data with month on the x-axis and sales on the y-axis.\n \n Note:\n - Each CSV file contains two columns: 'Month' and 'Sales'.\n\n Parameters:\n - directory (str): The directory path where the CSV files are located.\n - pattern (str): The regular expression pattern to match the filenames.\n\n Returns:\n - A list of matplotlib.axes._subplots.AxesSubplot objects, each representing a plot of sales data from a matched CSV file.\n\n Example usage:\n >>> axes = f_750('/path/to/data/', r'^sales_data_\\d{4}.csv')\n >>> len(axes)\n 2\n >>> axes[0].get_title()\n 'sales_data_2021.csv'\n \"\"\"", "canonical_solution": "\n plots = []\n for file in os.listdir(directory):\n if re.match(pattern, file):\n df = pd.read_csv(os.path.join(directory, file))\n ax = df.plot(x='Month', y='Sales', title=file)\n plots.append(ax)\n plt.show()\n return plots", "test": "import 
unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n # Prepare test data\n cls.directory = \"f_750_data_wenhao/\"\n cls.pattern = r\"^sales_data_\\d{4}.csv\"\n os.makedirs(cls.directory, exist_ok=True)\n data_2021 = pd.DataFrame({\n 'Month': ['January', 'February', 'March'],\n 'Sales': [100, 150, 200]\n })\n data_2022 = pd.DataFrame({\n 'Month': ['January', 'February', 'March'],\n 'Sales': [120, 130, 210]\n })\n data_2021.to_csv(cls.directory + \"sales_data_2021.csv\", index=False)\n data_2022.to_csv(cls.directory + \"sales_data_2022.csv\", index=False)\n @classmethod\n def tearDownClass(cls):\n # Clean up test data\n shutil.rmtree(cls.directory)\n def test_plots_generated(self):\n plots = f_750(self.directory, self.pattern)\n self.assertEqual(len(plots), 2, \"Should generate two plots for two CSV files\")\n def test_plot_titles(self):\n plots = f_750(self.directory, self.pattern)\n expected_titles = ['sales_data_2022.csv', 'sales_data_2021.csv']\n plot_titles = [plot.get_title() for plot in plots]\n self.assertEqual(plot_titles, expected_titles, \"Plot titles should match the CSV filenames\")\n def test_no_files_matched(self):\n plots = f_750(self.directory, r\"^no_match_\\d{4}.csv\")\n self.assertEqual(len(plots), 0, \"Should return an empty list if no files match the pattern\")\n def test_invalid_directory(self):\n with self.assertRaises(FileNotFoundError):\n f_750(\"/invalid/directory/\", self.pattern)\n def test_plot_data_integrity(self):\n plots = f_750(self.directory, self.pattern)\n # Read the CSV files again to get expected data\n expected_data = []\n for file in os.listdir(self.directory):\n if re.match(self.pattern, file):\n df = pd.read_csv(os.path.join(self.directory, file))\n expected_data.append(df['Sales'].to_list())\n for plot, expected_sales in zip(plots, expected_data):\n lines = plot.get_lines()\n for line in lines:\n y_data = line.get_ydata()\n # Use np.isclose for floating point comparison, if 
necessary\n self.assertTrue(any(np.array_equal(y_data, expected) for expected in expected_data), \"Plotted data should match the CSV file content\")", "apis": ["pandas.read_csv", "os.listdir", "matplotlib.pyplot.show", "os.path", "os.path.join", "re.match"], "libs": ["matplotlib", "pandas", "re", "os"], "doc": {"description": ["Searches a directory for CSV files matching a given regular expression pattern,", "reads sales data from these files, and plots the sales data with month on the x-axis and sales on the y-axis.", "Example usage:", ">>> axes = f_750('/path/to/data/', r'^sales_data_\\d{4}.csv')", ">>> len(axes)", "2", ">>> axes[0].get_title()", "'sales_data_2021.csv'"], "note": ["Each CSV file contains two columns: 'Month' and 'Sales'."], "params": ["directory (str): The directory path where the CSV files are located.", "pattern (str): The regular expression pattern to match the filenames."], "returns": ["A list of matplotlib.axes._subplots.AxesSubplot objects, each representing a plot of sales data from a matched CSV file."], "reqs": [], "raises": [], "example": []}} -{"task_id": "f_572", "prompt": "import numpy as np\nimport math\nimport random\nfrom random import uniform\n\n\ndef f_572(radius, num_points):\n \"\"\"\n Create a tuple with a list of random points within a circle of a given radius.\n \n Parameters:\n - radius (int): The radius of the circle.\n - num_points (int): The number of points to be generated.\n\n Returns:\n - out (list): A list of points within a circle.\n\n Requirements:\n - numpy\n - math\n - random\n\n Example:\n >>> random.seed(42)\n >>> f_572(1, 3)\n [(-0.10124546928297637, -0.12149119380571095), (-0.07399370924760951, 0.46662154808860146), (-0.06984148700093858, -0.8196472742078809)]\n \"\"\"", "canonical_solution": " out = []\n \n for _ in range(num_points):\n theta = uniform(0, 2*np.pi)\n r = radius * math.sqrt(uniform(0, 1))\n x = r * math.cos(theta)\n y = r * math.sin(theta)\n out.append((x, y))\n \n return out", "test": 
"import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n points = f_572(1, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 1)\n def test_case_2(self):\n points = f_572(2, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 4)\n def test_case_3(self):\n points = f_572(3, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 9)\n def test_case_4(self):\n points = f_572(4, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 16)\n def test_case_5(self):\n points = f_572(5, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 25)", "apis": ["math.cos", "math.sqrt", "numpy.pi", "random.uniform", "math.sin"], "libs": ["numpy", "random", "math"], "doc": {"description": ["Create a tuple with a list of random points within a circle of a given radius."], "note": [], "params": ["radius (int): The radius of the circle.", "num_points (int): The number of points to be generated."], "returns": ["out (list): A list of points within a circle."], "reqs": ["numpy", "math", "random"], "raises": [], "example": [">>> random.seed(42)", ">>> f_572(1, 3)", "[(-0.10124546928297637, -0.12149119380571095), (-0.07399370924760951, 0.46662154808860146), (-0.06984148700093858, -0.8196472742078809)]"]}} -{"task_id": "f_410", "prompt": "import collections\nimport matplotlib.pyplot as plt\n\n\ndef f_410(data):\n \"\"\"\n Combine a list of dictionaries with possibly differing keys (student names) into a single dictionary,\n calculate the average score for each student, and return a bar chart of average student scores with\n student on the x-axis and average score on the y-axis.\n\n This function handles data with varying dictionary lengths and missing keys by averaging available scores,\n ignoring None. If there is any negative score, the function raises ValueError.\n Bar colors can be: 'red', 'yellow', 'green', 'blue', 'purple'.\n\n Parameters:\n data (list): A list of dictionaries. 
The keys are student names and the values are scores.\n\n Returns:\n ax (matplotlib.axes._axes.Axes or None): A bar chart showing the 'Average Student Scores', with\n 'Student' on the x-axis and 'Average Score' on the y-axis.\n If data is empty, return None.\n\n Requirements:\n - collections\n - matplotlib.pyplot\n\n Example:\n >>> data = [{'John': 5, 'Jane': 10, 'Joe': 7},\\\n {'John': 6, 'Jane': 8, 'Joe': 10},\\\n {'John': 5, 'Jane': 9, 'Joe': 8},\\\n {'John': 7, 'Jane': 10, 'Joe': 9}]\n >>> ax = f_410(data)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0, 0, 'Jane'), Text(1, 0, 'Joe'), Text(2, 0, 'John')]\n \"\"\"", "canonical_solution": " if not data:\n return None\n\n combined_dict = {}\n for d in data:\n for k, v in d.items():\n if v is None:\n continue\n elif v < 0:\n raise ValueError(\"Scores must be non-negative.\")\n if k in combined_dict:\n combined_dict[k].append(v)\n else:\n combined_dict[k] = [v]\n\n avg_scores = {k: sum(v) / len(v) for k, v in combined_dict.items()}\n avg_scores = collections.OrderedDict(sorted(avg_scores.items()))\n labels, values = zip(*avg_scores.items())\n\n fig, ax = plt.subplots()\n ax.bar(labels, values, color=[\"red\", \"yellow\", \"green\", \"blue\", \"purple\"])\n ax.set_title(\"Average Student Scores\")\n ax.set_xlabel(\"Student\")\n ax.set_ylabel(\"Average Score\")\n\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def _check_plot_structure(self, ax):\n # Assert type of returned object\n self.assertIsInstance(ax, plt.Axes)\n # Check plot title, x-label, y-label\n self.assertEqual(ax.get_title(), \"Average Student Scores\")\n self.assertEqual(ax.get_xlabel(), \"Student\")\n self.assertEqual(ax.get_ylabel(), \"Average Score\")\n def test_case_1(self):\n # Test multiple users multiple data points\n data = [\n {\"John\": 5, \"Jane\": 10, \"Joe\": 7},\n {\"John\": 6, \"Jane\": 8, \"Joe\": 10},\n {\"John\": 5, \"Jane\": 9, \"Joe\": 8},\n {\"John\": 7, 
\"Jane\": 10, \"Joe\": 9},\n ]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, label in zip(ax.containers[0], [\"Jane\", \"Joe\", \"John\"]):\n if label == \"Jane\":\n self.assertEqual(bar.get_height(), 9.25)\n elif label == \"Joe\":\n self.assertEqual(bar.get_height(), 8.5)\n elif label == \"John\":\n self.assertEqual(bar.get_height(), 5.75)\n def test_case_2(self):\n # Test same user multiple data points\n data = [{\"John\": 5}, {\"John\": 6}, {\"John\": 7}, {\"John\": 8}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, _ in zip(ax.containers[0], [\"John\"]):\n self.assertEqual(bar.get_height(), 6.5)\n def test_case_3(self):\n # Test with multiple students and one data point each\n data = [{\"John\": 10}, {\"Jane\": 15}, {\"Joe\": 20}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights match the single data point for each student\n expected_scores = {\"Jane\": 15, \"Joe\": 20, \"John\": 10}\n for bar, label in zip(ax.containers[0], expected_scores.keys()):\n self.assertEqual(bar.get_height(), expected_scores[label])\n def test_case_4(self):\n # Test multiple users multiple data points different lengths\n data = [{\"Jane\": 10, \"Joe\": 7}, {\"Joe\": 10}, {\"Jane\": 9, \"John\": 8}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, label in zip(ax.containers[0], [\"Jane\", \"Joe\"]):\n if label == \"Jane\":\n self.assertAlmostEqual(bar.get_height(), 9.5, places=2)\n elif label == \"Joe\":\n self.assertAlmostEqual(bar.get_height(), 8.5, places=2)\n def test_case_5(self):\n # Test handling None\n data = [\n {\"Jane\": 10, \"Joe\": 7},\n {\"Joe\": 10, \"Jane\": None, \"John\": None},\n {\"Jane\": 9, \"John\": 8},\n {\"Joe\": None},\n ]\n ax = f_410(data)\n self._check_plot_structure(ax) # Results should be same as test_case_4\n for bar, label in zip(ax.containers[0], [\"Jane\", 
\"Joe\"]):\n if label == \"Jane\":\n self.assertAlmostEqual(bar.get_height(), 9.5, places=2)\n elif label == \"Joe\":\n self.assertAlmostEqual(bar.get_height(), 8.5, places=2)\n def test_case_6(self):\n # Test only one data point with multiple students\n data = [{\"John\": 5, \"Jane\": 10}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, label in zip(ax.containers[0], [\"Jane\", \"John\"]):\n if label == \"Jane\":\n self.assertEqual(bar.get_height(), 10)\n elif label == \"John\":\n self.assertEqual(bar.get_height(), 5)\n def test_case_7(self):\n # Test empty input\n data = []\n ax = f_410(data)\n self.assertIsNone(ax)\n def test_case_8(self):\n # Test with data containing negative scores\n data = [{\"John\": -2, \"Jane\": 3}, {\"John\": -4, \"Jane\": 5}]\n with self.assertRaises(ValueError):\n f_410(data)\n def test_case_9(self):\n # Test with a larger dataset\n data = [{\"John\": i} for i in range(1000)]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar height for the large dataset (average should be close to 499.5)\n self.assertAlmostEqual(\n next(iter(ax.containers[0])).get_height(), 499.5, places=2\n )\n def test_case_10(self):\n # Test with some negative scores mixed with positive ones\n data = [{\"John\": 5, \"Jane\": -1}, {\"John\": -2, \"Jane\": 2}]\n with self.assertRaises(ValueError):\n f_410(data)\n def test_case_11(self):\n # Test with all scores as 0\n data = [{\"John\": 0, \"Jane\": 0}, {\"John\": 0, \"Jane\": 0}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights are 0 for all students\n for bar, label in zip(ax.containers[0], [\"Jane\", \"John\"]):\n self.assertEqual(bar.get_height(), 0)\n def test_case_12(self):\n # Test with some dictionaries being empty\n data = [{\"John\": 5}, {}, {\"Jane\": 10}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check that the empty dictionary does not affect the output\n expected_scores = {\"Jane\": 10, \"John\": 
5}\n for bar, label in zip(ax.containers[0], expected_scores.keys()):\n self.assertEqual(bar.get_height(), expected_scores[label])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "collections.OrderedDict"], "libs": ["matplotlib", "collections"], "doc": {"description": ["Combine a list of dictionaries with possibly differing keys (student names) into a single dictionary,", "calculate the average score for each student, and return a bar chart of average student scores with", "student on the x-axis and average score on the y-axis.", "This function handles data with varying dictionary lengths and missing keys by averaging available scores,", "ignoring None. If there is any negative score, the function raises ValueError.", "Bar colors can be: 'red', 'yellow', 'green', 'blue', 'purple'."], "note": [], "params": ["data (list): A list of dictionaries. The keys are student names and the values are scores."], "returns": ["ax (matplotlib.axes._axes.Axes or None): A bar chart showing the 'Average Student Scores', with", "'Student' on the x-axis and 'Average Score' on the y-axis.", "If data is empty, return None."], "reqs": ["collections", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [{'John': 5, 'Jane': 10, 'Joe': 7},\\", "{'John': 6, 'Jane': 8, 'Joe': 10},\\", "{'John': 5, 'Jane': 9, 'Joe': 8},\\", "{'John': 7, 'Jane': 10, 'Joe': 9}]", ">>> ax = f_410(data)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0, 0, 'Jane'), Text(1, 0, 'Joe'), Text(2, 0, 'John')]"]}} +{"task_id": "f_365", "prompt": "import numpy as np\nimport pandas as pd\n\n\ndef f_365(data_str, separator=\",\", bins=20):\n \"\"\"\n Convert a string of numerical values separated by a specified separator into a pandas\n integer series, and then draw a histogram of the data.\n\n The function raises a ValueError if data is empty or it fails to convert the data.\n It plots the histogram with the following attributes:\n - grid: True\n - rwidth: 0.9\n - color: 
'#607c8e'\n\n Parameters:\n - data_str (str): The string of numbers separated by the specified separator.\n - separator (str, optional): The separator used in the data string. Default is ','.\n - bins (int, optional): Number of histogram bins. Default is 20.\n\n Returns:\n - tuple: A tuple containing:\n 1. Series: A pandas Series of the data coonverted into integers.\n 2. Axes: The Axes object of the plotted histogram.\n\n Requirements:\n - numpy\n - pandas\n\n Example:\n >>> series, ax = f_365('1,2,3,4,5,5,5,4,3,2,1')\n >>> print(type(series), series.tolist())\n [1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1]\n >>> print(type(ax))\n \n \"\"\"", "canonical_solution": "\n data = np.fromstring(data_str, sep=separator)\n if data.size == 0:\n raise ValueError(\"Failed to find valid data\")\n\n data = pd.Series(data, dtype='int64')\n ax = data.plot.hist(grid=True, bins=bins, rwidth=0.9, color=\"#607c8e\")\n return data, ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self) -> None:\n self.default_str = \"1,2,3,4,5,5,5,4,3,2,1\"\n self.default_expected = pd.Series([1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1])\n def assertHistogramAttributes(self, series, ax):\n # Check that the y-axis gridlines are set to True\n self.assertTrue(ax.yaxis.grid)\n # Ensure the histogram bars have the correct color\n self.assertEqual(matplotlib.colors.to_hex(ax.patches[0].get_fc()), \"#607c8e\")\n # Validate the heights of the histogram bars\n for patch in ax.patches:\n if (\n round(patch.get_x()) in series.values\n or round(patch.get_x() + patch.get_width()) in series.values\n ):\n self.assertTrue(patch.get_height() >= 0)\n def test_case_1(self):\n # Test default case\n series, ax = f_365(self.default_str)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, self.default_expected)\n def test_case_2(self):\n # Test function works on 
different bin sizes\n for bins in [5, 10, 15, 30, 100]:\n with self.subTest(bins=bins):\n series, ax = f_365(self.default_str, bins=bins)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, self.default_expected)\n def test_case_3(self):\n # Test custom separators\n data_str = \"1|2|3|4|5\"\n series, ax = f_365(data_str, separator=\"|\")\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, pd.Series([1, 2, 3, 4, 5]))\n def test_case_4(self):\n # Test negative and zero\n data_str = \"-5,-4,-3,-2,-1,0\"\n series, ax = f_365(data_str)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, pd.Series([-5, -4, -3, -2, -1, 0]))\n def test_case_5(self):\n # Test single item\n data_str = \"1\"\n series, ax = f_365(data_str)\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, pd.Series([1]))\n def test_case_6(self):\n # Test with float\n series, ax = f_365(\"1.0,2.0,3.0,4.0,5.0,5.0,5.0,4.0,3.0,2.0,1.0\")\n self.assertIsInstance(series, pd.Series)\n self.assertHistogramAttributes(series, ax)\n pd.testing.assert_series_equal(series, self.default_expected)\n def test_case_7(self):\n # Test with empty string\n data_str = \"\"\n with self.assertRaises(ValueError):\n f_365(data_str)\n def test_case_8(self):\n # Test with invalid data (contains string)\n data_str = \"a,b,c, 1\"\n with self.assertRaises(ValueError):\n f_365(data_str)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.fromstring", "pandas.Series"], "libs": ["pandas", "numpy"], "doc": {"description": ["Convert a string of numerical values separated by a specified separator into a pandas", "integer series, and then draw a histogram of the data.", "The function raises a ValueError if data is empty or it fails to 
convert the data.", "It plots the histogram with the following attributes:", "- grid: True", "- rwidth: 0.9", "- color: '#607c8e'"], "note": [], "params": ["data_str (str): The string of numbers separated by the specified separator.", "separator (str, optional): The separator used in the data string. Default is ','.", "bins (int, optional): Number of histogram bins. Default is 20."], "returns": ["tuple: A tuple containing:", "1. Series: A pandas Series of the data coonverted into integers.", "2. Axes: The Axes object of the plotted histogram."], "reqs": ["numpy", "pandas"], "raises": [], "example": [">>> series, ax = f_365('1,2,3,4,5,5,5,4,3,2,1')", ">>> print(type(series), series.tolist())", " [1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1]", ">>> print(type(ax))", ""]}} +{"task_id": "f_526", "prompt": "import shutil\nimport os\nimport fnmatch\nimport itertools\n\ndef f_526(src_dir, dst_dir):\n \"\"\"\n Copy all files from 'src_dir' to 'dst_dir' that match any pattern in ['*.txt', '*.docx'].\n\n Parameters:\n - src_dir (str): The source directory.\n - dst_dir (str): The destination directory.\n\n Returns:\n - str: The destination directory.\n \n Requirements:\n - shutil\n - os\n - fnmatch\n - itertools\n\n Example:\n >>> f_526('./source', './destination')\n >>> './destination'\n \"\"\"", "canonical_solution": " FILE_PATTERNS = ['*.txt', '*.docx']\n # Find all matching files\n matching_files = list(itertools.chain.from_iterable(\n fnmatch.filter(os.listdir(src_dir), pattern) for pattern in FILE_PATTERNS))\n\n for filename in matching_files:\n shutil.copy2(os.path.join(src_dir, filename), dst_dir)\n\n return dst_dir", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def base(self, src_dir, dst_dir):\n if os.path.exists(src_dir):\n shutil.rmtree(src_dir)\n # Create source directory\n os.mkdir(src_dir)\n # Create destination directory\n os.mkdir(dst_dir)\n # Create files\n for filename in ['a.txt', 'b.txt', 'c.docx', 'd.docx', 'e.txt', 'a.pdf', 'a.doc']:\n with 
open(os.path.join(src_dir, filename), 'w') as f:\n f.write('test')\n # Run function\n f_526(src_dir, dst_dir)\n # Check files\n for d in [src_dir, dst_dir]:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'b.txt')))\n self.assertTrue(os.path.exists(os.path.join(d, 'c.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'd.docx')))\n self.assertTrue(os.path.exists(os.path.join(d, 'e.txt')))\n self.assertFalse(os.path.exists(os.path.join(d, 'f.txt')))\n if d == src_dir:\n self.assertTrue(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertTrue(os.path.exists(os.path.join(d, 'a.doc')))\n else:\n self.assertFalse(os.path.exists(os.path.join(d, 'a.pdf')))\n self.assertFalse(os.path.exists(os.path.join(d, 'a.doc')))\n \n def tearDown(self):\n for d in ['./source', './destination', './src', './dst', './s', './d']:\n if os.path.exists(d):\n shutil.rmtree(d)\n def test_case_1(self):\n self.base('./source', './destination')\n \n def test_case_2(self):\n self.base('./src', './dst')\n \n def test_case_3(self):\n self.base('./s', './d')\n \n def test_case_4(self):\n self.base('./s', './destination')\n def test_case_5(self):\n self.base('./source', './d')", "apis": ["shutil.copy2", "fnmatch.filter", "os.listdir", "os.path", "os.path.join", "itertools.chain.from_iterable", "itertools.chain"], "libs": ["fnmatch", "os", "shutil", "itertools"], "doc": {"description": ["Copy all files from 'src_dir' to 'dst_dir' that match any pattern in ['*.txt', '*.docx']."], "note": [], "params": ["src_dir (str): The source directory.", "dst_dir (str): The destination directory."], "returns": ["str: The destination directory."], "reqs": ["shutil", "os", "fnmatch", "itertools"], "raises": [], "example": [">>> f_526('./source', './destination')", ">>> './destination'"]}} +{"task_id": "f_588", "prompt": "import pandas as pd\nfrom sklearn.cluster import DBSCAN\n\ndef f_588(data, cols):\n \"\"\"\n Perform DBSCAN clustering on the data 
by transforming it into a DataFrame and recording the clusters in a new column named 'Cluster'.\n Please choose the parameters eps=3 and min_samples=2.\n \n Parameters:\n - data (list): List of lists with the data, where the length of the inner list equals the number of columns\n - cols (list): List of column names\n \n Returns:\n - df (DataFrame): The DataFrame with a new 'Cluster' column.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> data = [[5.1, 3.5], [4.9, 3.0], [4.7, 3.2]]\n >>> cols = ['x', 'y']\n >>> df = f_588(data, cols)\n >>> print(df)\n x y Cluster\n 0 5.1 3.5 0\n 1 4.9 3.0 0\n 2 4.7 3.2 0\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data, columns=cols)\n dbscan = DBSCAN(eps=3, min_samples=2)\n df['Cluster'] = dbscan.fit_predict(df)\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = f_588([[5.1, 3.5], [4.9, 3.0], [4.7, 3.2]], ['x', 'y'])\n print(df)\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0])))\n def test_case_2(self):\n df = f_588([[1, 2], [3, 4], [5, 6]], ['x', 'y'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0])))\n def test_case_3(self):\n df = f_588([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]], ['x', 'y'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0, 1, 1, -1])))\n def test_case_4(self):\n df = f_588([[1, 2, 3], [2, 2, 2], [2, 3, 4], [8, 7, 6], [8, 8, 8], [25, 80, 100]], ['x', 'y', 'z'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0, 1, 1, -1])))\n def test_case_5(self):\n df = f_588([[-1, -2], [-2, -2], [-2, -3], [-8, -7], [-8, -8], [-25, -80]], ['x', 'y'])\n self.assertTrue('Cluster' in df.columns)\n self.assertTrue(np.array_equal(df['Cluster'], np.array([0, 0, 0, 1, 1, -1])))", "apis": 
["pandas.DataFrame", "sklearn.cluster.DBSCAN"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform DBSCAN clustering on the data by transforming it into a DataFrame and recording the clusters in a new column named 'Cluster'.", "Please choose the parameters eps=3 and min_samples=2."], "note": [], "params": ["data (list): List of lists with the data, where the length of the inner list equals the number of columns", "cols (list): List of column names"], "returns": ["df (DataFrame): The DataFrame with a new 'Cluster' column."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> data = [[5.1, 3.5], [4.9, 3.0], [4.7, 3.2]]", ">>> cols = ['x', 'y']", ">>> df = f_588(data, cols)", ">>> print(df)", "x y Cluster", "0 5.1 3.5 0", "1 4.9 3.0 0", "2 4.7 3.2 0"]}} +{"task_id": "f_894", "prompt": "import os\nimport hashlib\n\n# Constants\nDIRECTORY = \"./hashed_files\"\n\n\ndef f_894(input_string):\n \"\"\"\n Hash each non-empty line of a multi-line string using SHA256 and save the hashes to files.\n The filename is the first 10 characters of the hash, with a '.txt' extension.\n\n Parameters:\n - input_string (str): A multi-line string to be processed.\n\n Returns:\n - list[str]: A list of file paths where the hashes of non-empty lines are saved.\n\n Requirements:\n - os\n - hashlib\n\n Notes:\n - If the DIRECTORY does not exist, it is created.\n - Empty lines in the input string are ignored.\n\n Example:\n >>> file_paths = f_894('line a\\nfollows by line b\\n\\n...bye\\n')\n >>> print(file_paths)\n ['./hashed_files/489fe1fa6c.txt', './hashed_files/67009597fe.txt', './hashed_files/eab4758603.txt']\n \"\"\"", "canonical_solution": " if not os.path.exists(DIRECTORY):\n os.makedirs(DIRECTORY)\n\n file_paths = []\n lines = input_string.split(\"\\n\")\n for line in lines:\n if line: # Check if line is not empty\n line_hash = hashlib.sha256(line.encode()).hexdigest()\n filename = line_hash[:10] + \".txt\"\n filepath = os.path.join(DIRECTORY, filename)\n with 
open(filepath, \"w\", encoding=\"utf-8\") as file:\n file.write(line_hash)\n file_paths.append(filepath)\n\n return file_paths", "test": "import unittest\nimport os\nimport hashlib\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_894.\"\"\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up a temporary directory for test files.\"\"\"\n cls.temp_directory = \"./temp_test_files\"\n os.makedirs(cls.temp_directory, exist_ok=True)\n @classmethod\n def tearDownClass(cls):\n \"\"\"Clean up by removing the temporary directory after tests.\"\"\"\n shutil.rmtree(cls.temp_directory)\n dirs_to_remove = [\"hashed_files\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)\n def test_single_line(self):\n \"\"\"Test with a single line input.\"\"\"\n input_string = \"Hello world\"\n expected = [os.path.join(\"./hashed_files\", \"64ec88ca00.txt\")]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_multi_line(self):\n \"\"\"Test with a multi-line input.\"\"\"\n input_string = \"First line\\nSecond line\\nThird line\"\n expected = [\n os.path.join(\"./hashed_files\", \"2361df1018.txt\"),\n os.path.join(\"./hashed_files\", \"c8b588f708.txt\"),\n os.path.join(\"./hashed_files\", \"3195807ae4.txt\"),\n ]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_empty_input(self):\n \"\"\"Test with an empty string.\"\"\"\n input_string = \"\"\n expected = []\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_input_with_empty_lines(self):\n \"\"\"Test input string containing empty lines.\"\"\"\n input_string = \"Line one\\n\\nLine two\\n\"\n expected = [\n os.path.join(\"./hashed_files\", \"209f4c0be3.txt\"),\n os.path.join(\"./hashed_files\", \"1ae5466eb8.txt\"),\n ]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_no_newline_at_end(self):\n \"\"\"Test input string without a newline at the end.\"\"\"\n 
input_string = \"Line with no newline at end\"\n expected = [os.path.join(\"./hashed_files\", \"901dd863e9.txt\")]\n result = f_894(input_string)\n self.assertEqual(result, expected)\n def test_directory_creation(self):\n \"\"\"\n Test if the function creates the directory if it does not exist.\n \"\"\"\n # Assert that the DIRECTORY does not exist before calling the function\n self.assertFalse(os.path.exists(DIRECTORY))\n # Call the function with any string\n f_894(\"Test for directory creation\")\n # Check if the DIRECTORY has been created\n self.assertTrue(os.path.exists(DIRECTORY))\n # Optionally, clean up by removing the created directory after the test\n if os.path.exists(DIRECTORY):\n shutil.rmtree(DIRECTORY)", "apis": ["hashlib.sha256", "os.makedirs", "os.path", "os.path.join", "os.path.exists"], "libs": ["os", "hashlib"], "doc": {"description": ["Hash each non-empty line of a multi-line string using SHA256 and save the hashes to files.", "The filename is the first 10 characters of the hash, with a '.txt' extension.", "Notes:", "- If the DIRECTORY does not exist, it is created.", "- Empty lines in the input string are ignored."], "note": [], "params": ["input_string (str): A multi-line string to be processed."], "returns": ["list[str]: A list of file paths where the hashes of non-empty lines are saved."], "reqs": ["os", "hashlib"], "raises": [], "example": [">>> file_paths = f_894('line a\\nfollows by line b\\n\\n...bye\\n')", ">>> print(file_paths)", "['./hashed_files/489fe1fa6c.txt', './hashed_files/67009597fe.txt', './hashed_files/eab4758603.txt']"]}} +{"task_id": "f_352", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\n\n\ndef f_352(data, n_components=2, random_state=None):\n \"\"\"\n Performs Principal Component Analysis (PCA) on the provided dataset to reduce its dimensionality,\n and visualizes the results using a scatter plot.\n\n This function applies PCA to the dataset, reducing its features to 
the specified number of principal components.\n It then visualizes the reduced data in a scatter plot. For datasets reduced to a single component, the function\n generates a 1D scatter plot along the X-axis, with all Y-values set to zero. For reductions resulting in two or more\n components, only the first two principal components are visualized.\n\n Parameters:\n - data (ndarray): A numpy ndarray of shape (n_samples, n_features) representing the data.\n - n_components (int, optional): Number of components to keep. Defaults to 2.\n - random_state (int, optional): Seed for reproducibility. Defaults to None.\n\n Returns:\n dict: A dictionary containing:\n - \"transformed_data\" (np.ndarray): The transformed data.\n - \"ax\" (plt.Axes): The scatter plot visualizing the transformed data.\n\n Requirements:\n - numpy\n - matplotlib\n - sklearn\n\n Example:\n >>> data = np.random.random((100, 5))\n >>> results = f_352(data, random_state=42)\n >>> results['transformed_data'].shape\n (100, 2)\n >>> type(results['ax'])\n \n \"\"\"", "canonical_solution": " pca = PCA(n_components=n_components, random_state=random_state)\n transformed_data = pca.fit_transform(data)\n\n fig, ax = plt.subplots()\n if transformed_data.shape[1] == 1:\n ax.scatter(transformed_data[:, 0], np.zeros_like(transformed_data[:, 0]))\n else:\n ax.scatter(transformed_data[:, 0], transformed_data[:, 1])\n\n return {\"transformed_data\": transformed_data, \"ax\": ax}", "test": "import unittest\nfrom sklearn.decomposition import PCA\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.seed = 42\n self.n = 100\n self.n_dims = 5\n self.n_components = 2\n self.data = np.random.RandomState(self.seed).random((self.n, self.n_dims))\n def assert_pca_correctness(self, data, results, n_components, random_state):\n \"\"\"Helper method to assert PCA correctness\"\"\"\n # 1. 
Variance explained\n pca = PCA(n_components=n_components, random_state=random_state)\n pca.fit(data)\n explained_variance_ratio = pca.explained_variance_ratio_\n if data.shape[1] == 1:\n # For one-dimensional data, the explained variance ratio should be 1\n self.assertAlmostEqual(explained_variance_ratio[0], 1.0, delta=1e-2)\n else:\n cov_matrix = np.cov(data, rowvar=False)\n eigenvalues = np.linalg.eigvals(cov_matrix)\n sorted_eigenvalues = np.sort(eigenvalues)[::-1][:n_components]\n normalized_eigenvalues = sorted_eigenvalues / sum(eigenvalues)\n self.assertTrue(\n np.allclose(explained_variance_ratio, normalized_eigenvalues, atol=1e-1)\n )\n # 2. Orthogonality\n for i in range(n_components):\n for j in range(i + 1, n_components):\n dot_product = np.dot(\n results[\"transformed_data\"][:, i], results[\"transformed_data\"][:, j]\n )\n self.assertAlmostEqual(dot_product, 0, delta=1e-2)\n def test_case_1(self):\n # Test with default settings\n results = f_352(self.data, random_state=self.seed)\n self.assertEqual(results[\"transformed_data\"].shape, (self.n, self.n_components))\n x_data = results[\"ax\"].collections[0].get_offsets()[:, 0]\n y_data = results[\"ax\"].collections[0].get_offsets()[:, 1]\n self.assertTrue(np.array_equal(x_data, results[\"transformed_data\"][:, 0]))\n self.assertTrue(np.array_equal(y_data, results[\"transformed_data\"][:, 1]))\n self.assert_pca_correctness(self.data, results, self.n_components, self.seed)\n def test_case_2(self):\n # Test n_components\n for n_components in [1, 2, min(self.data.shape)]:\n results = f_352(self.data, n_components=n_components, random_state=42)\n self.assertEqual(results[\"transformed_data\"].shape[1], n_components)\n self.assert_pca_correctness(self.data, results, n_components, self.seed)\n def test_case_3(self):\n # Test when one of the features has zero variance\n data = self.data.copy()\n data[:, 1] = 0 # Second feature has zero variance\n results = f_352(data, n_components=2, random_state=self.seed)\n 
self.assertEqual(results[\"transformed_data\"].shape, (100, 2))\n self.assert_pca_correctness(data, results, 2, self.seed)\n def test_case_4(self):\n # Test with n_components greater than min(n_samples, n_features)\n data = np.random.RandomState(self.seed).randn(10, 2)\n with self.assertRaises(ValueError):\n f_352(data, n_components=3, random_state=self.seed)\n def test_case_5(self):\n # Test with a single sample\n data = np.random.RandomState(self.seed).randn(1, self.n_dims)\n with self.assertRaises(ValueError):\n f_352(data)\n def test_case_6(self):\n # Edge case - test when dataset contains NaN\n data = self.data.copy()\n data[0, 0] = np.nan # Introduce a NaN value\n with self.assertRaises(ValueError):\n f_352(data, n_components=2, random_state=self.seed)\n def test_case_7(self):\n # Edge case - test when dataset contains infinite values\n data = self.data.copy()\n data[0, 0] = np.inf # Introduce an infinite value\n with self.assertRaises(ValueError):\n f_352(data, n_components=2, random_state=self.seed)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "numpy.zeros_like", "sklearn.decomposition.PCA"], "libs": ["sklearn", "numpy", "matplotlib"], "doc": {"description": ["Performs Principal Component Analysis (PCA) on the provided dataset to reduce its dimensionality,", "and visualizes the results using a scatter plot.", "This function applies PCA to the dataset, reducing its features to the specified number of principal components.", "It then visualizes the reduced data in a scatter plot. For datasets reduced to a single component, the function", "generates a 1D scatter plot along the X-axis, with all Y-values set to zero. For reductions resulting in two or more", "components, only the first two principal components are visualized."], "note": [], "params": ["data (ndarray): A numpy ndarray of shape (n_samples, n_features) representing the data.", "n_components (int, optional): Number of components to keep. 
Defaults to 2.", "random_state (int, optional): Seed for reproducibility. Defaults to None."], "returns": ["dict: A dictionary containing:", "\"transformed_data\" (np.ndarray): The transformed data.", "\"ax\" (plt.Axes): The scatter plot visualizing the transformed data."], "reqs": ["numpy", "matplotlib", "sklearn"], "raises": [], "example": [">>> data = np.random.random((100, 5))", ">>> results = f_352(data, random_state=42)", ">>> results['transformed_data'].shape", "(100, 2)", ">>> type(results['ax'])", ""]}} +{"task_id": "f_793", "prompt": "import numpy as np\nfrom scipy.linalg import svd\n\ndef f_793(rows=3, columns=2, seed=0):\n \"\"\"\n Generate a matrix of random values with specified dimensions and perform Singular Value Decomposition (SVD) on it.\n\n Requirements:\n - numpy\n - scipy.linalg.svd\n\n Parameters:\n - rows (int): Number of rows for the random matrix. Default is 3.\n - columns (int): Number of columns for the random matrix. Default is 2.\n - seed (int, optional): Seed for the random number generator to ensure reproducibility. 
Default is None.\n\n Returns:\n tuple: A tuple containing three elements:\n - U (ndarray): The unitary matrix U.\n - s (ndarray): The singular values, sorted in descending order.\n - Vh (ndarray): The conjugate transpose of the unitary matrix V.\n\n Example:\n >>> U, s, Vh = f_793(3, 2, seed=42)\n >>> print('U shape:', U.shape)\n U shape: (3, 3)\n >>> print('s shape:', s.shape)\n s shape: (2,)\n >>> print('Vh shape:', Vh.shape)\n Vh shape: (2, 2)\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n matrix = np.random.rand(rows, columns)\n U, s, Vh = svd(matrix)\n\n return U, s, Vh", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n # Test with default 3x2 matrix\n U, s, Vh = f_793(seed=3)\n self.assertEqual(U.shape, (3, 3))\n self.assertEqual(s.shape, (2,))\n self.assertEqual(Vh.shape, (2, 2))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_2(self):\n # Test with a 5x5 square matrix\n U, s, Vh = f_793(5, 5, seed=42)\n self.assertEqual(U.shape, (5, 5))\n self.assertEqual(s.shape, (5,))\n self.assertEqual(Vh.shape, (5, 5))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_3(self):\n # Test with a 2x3 matrix (more columns than rows)\n U, s, Vh = f_793(2, 3, seed=12)\n self.assertEqual(U.shape, (2, 2))\n self.assertEqual(s.shape, (2,))\n self.assertEqual(Vh.shape, (3, 3))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_4(self):\n # Test with a 1x1 matrix (a scalar)\n U, s, Vh = f_793(1, 1, seed=0)\n self.assertEqual(U.shape, (1, 1))\n self.assertEqual(s.shape, (1,))\n self.assertEqual(Vh.shape, (1, 1))\n self.assertTrue(np.all(s >= 0))\n \n def test_case_5(self):\n # Test with a 4x3 matrix\n U, s, Vh = f_793(4, 3, seed=1)\n self.assertEqual(U.shape, (4, 4))\n self.assertEqual(s.shape, (3,))\n self.assertEqual(Vh.shape, (3, 3))\n self.assertTrue(np.all(s >= 0))", "apis": ["scipy.linalg.svd", "numpy.random", "numpy.random.seed", "numpy.random.rand"], "libs": ["numpy", "scipy"], "doc": 
{"description": ["Generate a matrix of random values with specified dimensions and perform Singular Value Decomposition (SVD) on it."], "note": [], "params": ["rows (int): Number of rows for the random matrix. Default is 3.", "columns (int): Number of columns for the random matrix. Default is 2.", "seed (int, optional): Seed for the random number generator to ensure reproducibility. Default is None."], "returns": ["tuple: A tuple containing three elements:", "U (ndarray): The unitary matrix U.", "s (ndarray): The singular values, sorted in descending order.", "Vh (ndarray): The conjugate transpose of the unitary matrix V."], "reqs": ["numpy", "scipy.linalg.svd"], "raises": [], "example": [">>> U, s, Vh = f_793(3, 2, seed=42)", ">>> print('U shape:', U.shape)", "U shape: (3, 3)", ">>> print('s shape:', s.shape)", "s shape: (2,)", ">>> print('Vh shape:', Vh.shape)", "Vh shape: (2, 2)"]}} +{"task_id": "f_772", "prompt": "import random\nimport string\n\ndef f_772(word):\n \"\"\"\n Generates a list of random pairs of adjacent letters from the given word. The number of such pairs will be equal to the length of the constant POSSIBLE_LETTERS.\n \n Parameters:\n word (str): The input string. Must only contain letters.\n \n Returns:\n list: A list of random pairs of adjacent letters from the word. 
If the word has fewer than 2 letters, returns a list of empty strings based on POSSIBLE_LETTERS length.\n \n Examples:\n >>> random.seed(0); f_772('abcdef')\n ['ab', 'bc', 'cd']\n >>> random.seed(0); f_772('xyz')\n ['xy', 'xy', 'yz']\n \"\"\"", "canonical_solution": " if not all(char in string.ascii_letters for char in word):\n raise ValueError(\"Input must only contain letters.\")\n \n if len(word) < 2:\n return ['' for _ in range(len(['a', 'b', 'c']))]\n \n pairs = [''.join(x) for x in zip(word, word[1:])]\n random_pairs = [random.choice(pairs) for _ in range(len(['a', 'b', 'c']))]\n\n return random_pairs", "test": "import unittest\nimport random\n# Assuming the function is correctly imported from its script\n# from f_772 import f_772 \nclass TestCases(unittest.TestCase):\n def test_with_valid_input(self):\n random.seed(0)\n result = f_772('abcdef')\n self.assertEqual(len(result), 3, \"Output list should have length 3\")\n valid_pairs = ['ab', 'bc', 'cd', 'de', 'ef']\n for pair in result:\n self.assertIn(pair, valid_pairs, f\"Pair '{pair}' is not a valid adjacent pair in 'abcdef'\")\n def test_single_character(self):\n random.seed(42)\n result = f_772('a')\n expected = ['', '', '']\n self.assertEqual(result, expected, \"Should return list of empty strings for a single character\")\n def test_empty_string(self):\n random.seed(55)\n result = f_772('')\n expected = ['', '', '']\n self.assertEqual(result, expected, \"Should return list of empty strings for an empty string\")\n def test_non_letter_input(self):\n random.seed(0)\n with self.assertRaises(ValueError):\n f_772('123')\n def test_long_input(self):\n random.seed(5)\n result = f_772('abcdefghijklmnopqrstuvwxyz')\n all_pairs = [''.join(x) for x in zip('abcdefghijklmnopqrstuvwxyz', 'abcdefghijklmnopqrstuvwxyz'[1:])]\n for pair in result:\n self.assertIn(pair, all_pairs, f\"Pair '{pair}' is not a valid adjacent pair in the alphabet\")", "apis": ["string.ascii_letters", "random.choice"], "libs": ["random", 
"string"], "doc": {"description": ["Generates a list of random pairs of adjacent letters from the given word. The number of such pairs will be equal to the length of the constant POSSIBLE_LETTERS."], "note": [], "params": ["word (str): The input string. Must only contain letters."], "returns": ["list: A list of random pairs of adjacent letters from the word. If the word has fewer than 2 letters, returns a list of empty strings based on POSSIBLE_LETTERS length."], "reqs": [], "raises": [], "example": ["Examples:", ">>> random.seed(0); f_772('abcdef')", "['ab', 'bc', 'cd']", ">>> random.seed(0); f_772('xyz')", "['xy', 'xy', 'yz']"]}} +{"task_id": "f_919", "prompt": "import datetime\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nTIME_FORMAT = \"%d/%m/%y %H:%M:%S.%f\"\n\n\ndef f_919(time_strings):\n \"\"\"\n Compute the differences in seconds between consecutive datetime strings and plot these differences as a bar chart.\n\n Parameters:\n - time_strings (list of str): A list of datetime strings in the format 'dd/mm/yy HH:MM:SS.fff'.\n\n Returns:\n - matplotlib.axes.Axes: The axes object of the plotted bar chart. 
This object allows further customization of the plot outside this function.\n\n Requirements:\n - datetime\n - numpy\n - matplotlib\n\n Note:\n - The function requires the datetime, numpy, and matplotlib.pyplot modules.\n - The datetime strings in the input list should follow the specific format specified in TIME_FORMAT.\n - The function calculates the time differences between each pair of consecutive datetime strings in the list.\n\n Example:\n >>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']\n >>> ax = f_919(time_strings)\n >>> plt.show() # This will display the bar chart\n \"\"\"", "canonical_solution": " # Calculate time differences\n differences = (\n np.diff([datetime.datetime.strptime(t, TIME_FORMAT) for t in time_strings])\n .astype(\"timedelta64[s]\")\n .astype(int)\n )\n\n # Plotting the bar chart\n _ = plt.bar(range(len(differences)), differences)\n plt.xlabel(\"Index\")\n plt.ylabel(\"Time Difference (seconds)\")\n plt.title(\"Time Differences Between Consecutive Timestamps\")\n return plt.gca()", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_919\"\"\"\n def test_regular_time_strings(self):\n \"\"\"Test Regular Time Strings with 1-second difference\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:33.123\",\n \"30/03/09 16:31:34.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [1.0, 1.0])\n def test_different_time_units(self):\n \"\"\"Test Time Strings with Different Day, Hour, Minute, and Second Differences\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"31/03/09 17:32:33.123\",\n \"01/04/09 18:33:34.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n expected_diffs = [(86400 + 3600 + 60 + 1), (86400 + 3600 + 60 + 1)]\n 
self.assertEqual(bar_heights, expected_diffs)\n def test_millisecond_difference(self):\n \"\"\"Test Time Strings with Millisecond Differences\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:32.623\",\n \"30/03/09 16:31:33.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [0, 0])\n def test_no_difference(self):\n \"\"\"Test Time Strings with No Difference\"\"\"\n time_strings = [\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:32.123\",\n \"30/03/09 16:31:32.123\",\n ]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [0.0, 0.0])\n def test_large_list(self):\n \"\"\"Test Large List of Time Strings with Constant 1-second Difference\"\"\"\n time_strings = [\"30/03/09 16:31:\" + f\"{i:02}.123\" for i in range(30, 40)]\n ax = f_919(time_strings)\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n plt.close()\n self.assertEqual(bar_heights, [1.0] * 9)", "apis": ["matplotlib.pyplot.bar", "numpy.diff", "datetime.datetime.strptime", "matplotlib.pyplot.title", "datetime.datetime", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.gca", "matplotlib.pyplot.xlabel"], "libs": ["numpy", "matplotlib", "datetime"], "doc": {"description": ["Compute the differences in seconds between consecutive datetime strings and plot these differences as a bar chart."], "note": ["The function requires the datetime, numpy, and matplotlib.pyplot modules.", "The datetime strings in the input list should follow the specific format specified in TIME_FORMAT.", "The function calculates the time differences between each pair of consecutive datetime strings in the list."], "params": ["time_strings (list of str): A list of datetime strings in the format 'dd/mm/yy HH:MM:SS.fff'."], "returns": ["matplotlib.axes.Axes: The axes object of the plotted bar chart. 
This object allows further customization of the plot outside this function."], "reqs": ["datetime", "numpy", "matplotlib"], "raises": [], "example": [">>> time_strings = ['30/03/09 16:31:32.123', '30/03/09 16:32:33.123', '30/03/09 16:33:34.123']", ">>> ax = f_919(time_strings)", ">>> plt.show() # This will display the bar chart"]}} +{"task_id": "f_355", "prompt": "from scipy.spatial.distance import cdist\nfrom sklearn.datasets import make_blobs\nimport matplotlib.pyplot as plt\n\n\ndef f_355(n_samples=200, centers=4, plot_path=None, random_seed=None):\n \"\"\"\n Generate a synthetic 2D dataset using make_blobs, visualize the dataset, and then calculate\n the Euclidean distance between individual samples of the dataset.\n\n Parameters:\n - n_samples (int): Number of samples to generate. Default is 200.\n - centers (int): Number of centers to generate. Default is 4.\n - plot_path (str, optional): Path to save the plot. If None, the plot will be returned.\n - random_seed (int, optional): Seed for random number generation. 
Default is None.\n\n Returns:\n - tuple:\n - ndarray: A 2D array with distances between each sample.\n - Axes or None: If plot_path is None, returns the matplotlib Axes object of the plot.\n Otherwise, saves the plot to the provided path and return None.\n Plot shows values of the first feature dimension on the x-axis, values\n of the second feature dimension on the y-axis, and labels of the synthetic\n examples as color.\n\n Requirements:\n - scipy.spatial.distance.cdist\n - sklearn.datasets.make_blobs\n - matplotlib.pyplot\n\n Example:\n >>> distances, plot = f_355(random_seed=42)\n >>> distances.shape\n (200, 200)\n >>> plot\n \n \"\"\"", "canonical_solution": " X, y = make_blobs(\n n_samples=n_samples,\n n_features=2,\n centers=centers,\n random_state=random_seed,\n )\n\n fig, ax = plt.subplots()\n\n ax.scatter(X[:, 0], X[:, 1], c=y)\n\n if plot_path:\n plt.savefig(plot_path)\n plt.close(fig)\n return cdist(X, X), None\n\n return cdist(X, X), ax", "test": "import unittest\nimport tempfile\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.seed = 42\n self.temp_dir = tempfile.TemporaryDirectory()\n def test_case_1(self):\n # Default parameters\n distances, plot = f_355()\n self.assertEqual(distances.shape, (200, 200))\n self.assertEqual(len(plot.collections[0].get_offsets()), 200)\n self.assertEqual(len(set(plot.collections[0].get_array())), 4)\n def test_case_2(self):\n # Custom parameters\n n_samples, centers = 50, 5\n distances, plot = f_355(\n random_seed=self.seed, n_samples=n_samples, centers=centers\n )\n self.assertEqual(distances.shape, (n_samples, n_samples))\n self.assertEqual(len(plot.collections[0].get_offsets()), n_samples)\n self.assertEqual(len(set(plot.collections[0].get_array())), centers)\n def test_case_3(self):\n # Saving the plot to a path\n plot_path = os.path.join(self.temp_dir.name, \"test_plot.png\")\n distances, plot = f_355(random_seed=self.seed, 
plot_path=plot_path)\n self.assertEqual(distances.shape, (200, 200))\n self.assertTrue(os.path.exists(plot_path))\n self.assertIsNone(plot)\n def test_case_4(self):\n # Test reproducibility with the same seed\n distances1, _ = f_355(random_seed=self.seed)\n distances2, _ = f_355(random_seed=self.seed)\n np.testing.assert_array_equal(distances1, distances2)\n # Test different outputs with different seeds\n distances3, _ = f_355(random_seed=43)\n with self.assertRaises(AssertionError):\n np.testing.assert_array_equal(distances1, distances3)\n def test_case_5(self):\n # Test negative parameters for n_samples\n with self.assertRaises(ValueError):\n f_355(n_samples=-100, random_seed=self.seed)\n def test_case_6(self):\n # Test non-integer inputs for n_samples\n with self.assertRaises(TypeError):\n f_355(n_samples=200.5, random_seed=self.seed)\n def tearDown(self):\n plt.close(\"all\")\n self.temp_dir.cleanup()", "apis": ["sklearn.datasets.make_blobs", "scipy.spatial.distance.cdist", "matplotlib.pyplot.savefig", "matplotlib.pyplot.close", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "sklearn", "scipy"], "doc": {"description": ["Generate a synthetic 2D dataset using make_blobs, visualize the dataset, and then calculate", "the Euclidean distance between individual samples of the dataset."], "note": [], "params": ["n_samples (int): Number of samples to generate. Default is 200.", "centers (int): Number of centers to generate. Default is 4.", "plot_path (str, optional): Path to save the plot. If None, the plot will be returned.", "random_seed (int, optional): Seed for random number generation. 
Default is None."], "returns": ["tuple:", "ndarray: A 2D array with distances between each sample.", "Axes or None: If plot_path is None, returns the matplotlib Axes object of the plot.", "Otherwise, saves the plot to the provided path and return None.", "Plot shows values of the first feature dimension on the x-axis, values", "of the second feature dimension on the y-axis, and labels of the synthetic", "examples as color."], "reqs": ["scipy.spatial.distance.cdist", "sklearn.datasets.make_blobs", "matplotlib.pyplot"], "raises": [], "example": [">>> distances, plot = f_355(random_seed=42)", ">>> distances.shape", "(200, 200)", ">>> plot", ""]}} +{"task_id": "f_910", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nNUM_SAMPLES = 100\nNUM_OUTLIERS = 5\n\n\ndef f_910(num_samples=NUM_SAMPLES, num_outliers=NUM_OUTLIERS):\n \"\"\"\n Generate a dataset comprising both normal data and artificially introduced outliers,\n and plot a histogram of the combined data. The function detects outliers in the dataset\n using the Interquartile Range (IQR) method, but it only considers the normally distributed\n portion of the data for outlier detection. The outliers detected and the artificially\n introduced outliers might not always coincide.\n\n Parameters:\n - num_samples (int): Number of samples to be drawn from a normal distribution. The default \n value is 100. If set to zero or a negative number, no normal data will be generated, \n and the dataset will only contain artificially introduced outliers.\n - num_outliers (int): Number of outliers to be artificially introduced into the dataset. \n These outliers are uniformly distributed between -10 and 10. The default value is 5. 
\n If set to zero, no outliers will be artificially introduced.\n\n\n Returns:\n - data (numpy array): The combined dataset, including both normally distributed data and \n the artificially introduced outliers.\n - outliers_detected (numpy array): The outliers detected using the IQR method. This \n detection is based solely on the normally distributed portion of the data.\n - ax (matplotlib.axes._subplots.Axes): The Axes object for the histogram \n plot of the combined dataset.\n\n Requirements:\n - numpy\n - matplotlib\n\n Note:\n - The artificially introduced outliers are not necessarily the same as the outliers\n detected by the IQR method. The IQR method is applied only to the normally distributed\n data, and thus some of the artificially introduced outliers may not be detected,\n and some normal data points may be falsely identified as outliers.\n\n Example:\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> data, outliers_detected, ax = f_910()\n >>> print(outliers_detected)\n [-9.61613603 -3.96850367 3.20347075]\n \"\"\"", "canonical_solution": " normal_data = np.random.normal(size=num_samples)\n outliers = np.random.uniform(low=-10, high=10, size=num_outliers)\n data = np.concatenate([normal_data, outliers]) if num_samples > 0 else outliers\n\n # Identify outliers using IQR (only if there is normal data)\n outliers_detected = np.array([])\n if num_samples > 0:\n q75, q25 = np.percentile(normal_data, [75, 25])\n iqr = q75 - q25\n lower_bound = q25 - (iqr * 1.5)\n upper_bound = q75 + (iqr * 1.5)\n outliers_detected = data[(data < lower_bound) | (data > upper_bound)]\n\n # Plot histogram\n _, ax = plt.subplots()\n ax.hist(data, bins=30)\n\n return data, outliers_detected, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_910.\"\"\"\n def test_default_values(self):\n \"\"\"Test the function with default values.\"\"\"\n np.random.seed(0)\n data, _, _ = 
f_910()\n self.assertEqual(len(data), 105)\n def test_custom_values(self):\n \"\"\"Test the function with custom values.\"\"\"\n np.random.seed(1)\n data, outliers_detected, _ = f_910(num_samples=50, num_outliers=10)\n self.assertEqual(len(data), 60)\n # Replicate the IQR calculation for testing\n normal_data = data[:50] # Assuming the first 50 are normal data\n q75, q25 = np.percentile(normal_data, [75, 25])\n iqr = q75 - q25\n lower_bound = q25 - (iqr * 1.5)\n upper_bound = q75 + (iqr * 1.5)\n expected_outliers_count = len(\n [o for o in data if o < lower_bound or o > upper_bound]\n )\n self.assertEqual(len(outliers_detected), expected_outliers_count)\n def test_no_outliers(self):\n \"\"\"Test the function with no outliers.\"\"\"\n np.random.seed(2)\n data, outliers_detected, ax = f_910(num_samples=100, num_outliers=0)\n self.assertEqual(len(data), 100)\n # Adjust the expectation to consider possible false positives\n self.assertTrue(len(outliers_detected) <= 1) # Allow for up to 1 false positive\n def test_only_outliers(self):\n \"\"\"Test the function with only outliers.\"\"\"\n np.random.seed(3)\n data, outliers_detected, _ = f_910(num_samples=0, num_outliers=100)\n self.assertEqual(len(data), 100)\n # Since no normal data is generated, IQR is not applied, and no outliers are detected.\n self.assertEqual(len(outliers_detected), 0)\n def test_negative_values(self):\n \"\"\"Test the function with negative values.\"\"\"\n np.random.seed(4)\n with self.assertRaises(ValueError):\n f_910(num_samples=-10, num_outliers=-5)\n def tearDown(self):\n plt.close()", "apis": ["numpy.percentile", "numpy.random", "numpy.random.uniform", "numpy.array", "matplotlib.pyplot.subplots", "numpy.concatenate", "numpy.random.normal"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Generate a dataset comprising both normal data and artificially introduced outliers,", "and plot a histogram of the combined data. 
The function detects outliers in the dataset", "using the Interquartile Range (IQR) method, but it only considers the normally distributed", "portion of the data for outlier detection. The outliers detected and the artificially", "introduced outliers might not always coincide."], "note": ["The artificially introduced outliers are not necessarily the same as the outliers", "detected by the IQR method. The IQR method is applied only to the normally distributed", "data, and thus some of the artificially introduced outliers may not be detected,", "and some normal data points may be falsely identified as outliers."], "params": ["num_samples (int): Number of samples to be drawn from a normal distribution. The default", "value is 100. If set to zero or a negative number, no normal data will be generated,", "and the dataset will only contain artificially introduced outliers.", "num_outliers (int): Number of outliers to be artificially introduced into the dataset.", "These outliers are uniformly distributed between -10 and 10. The default value is 5.", "If set to zero, no outliers will be artificially introduced."], "returns": ["data (numpy array): The combined dataset, including both normally distributed data and", "the artificially introduced outliers.", "outliers_detected (numpy array): The outliers detected using the IQR method. 
This", "detection is based solely on the normally distributed portion of the data.", "ax (matplotlib.axes._subplots.Axes): The Axes object for the histogram", "plot of the combined dataset."], "reqs": ["numpy", "matplotlib"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(0)", ">>> data, outliers_detected, ax = f_910()", ">>> print(outliers_detected)", "[-9.61613603 -3.96850367 3.20347075]"]}} +{"task_id": "f_849", "prompt": "import os\nimport requests\nfrom zipfile import ZipFile, BadZipFile\n\n\ndef f_849(url, download_path=\"mnt/data/downloads/\"):\n \"\"\"\n Downloads and extracts a ZIP file from a specified URL to a given directory.\n\n Parameters:\n - url (str): The URL from which to download the ZIP file. It should be a valid and accessible URL.\n - download_path (str): The directory path where the ZIP file will be downloaded and extracted.\n Defaults to \"mnt/data/downloads/\".\n\n Returns:\n - str: Path to the directory containing the extracted contents. 
If an error occurs, a descriptive\n message detailing the type of error is returned.\n\n Raises:\n - Network Issues or Invalid URL: Returns \"Error: Unable to download the file from the provided URL.\"\n if there are issues in reaching the URL or downloading the file.\n - Incorrect File Type: Returns \"Error: The URL does not point to a ZIP file.\" if the downloaded file's\n content type is not 'application/zip'.\n - Corrupt ZIP File: Returns \"Error: The downloaded file is not a valid ZIP file.\" if the downloaded file\n is a ZIP file but is corrupt or cannot be extracted.\n - General Exceptions: Catches and reports any other exceptions (like runtime errors) that occur during\n the process with a specific error message, formatted as \"Error: [exception message]\".\n\n\n Requirements:\n - requests\n - os\n - zipfile\n\n Example:\n >>> f_849('https://example.com/file.zip')\n 'mnt/data/downloads/file'\n \"\"\"", "canonical_solution": " if not os.path.exists(download_path):\n os.makedirs(download_path)\n\n try:\n response = requests.get(url, timeout=5)\n response.raise_for_status()\n\n # Verify content type\n if \"application/zip\" not in response.headers.get(\"Content-Type\", \"\"):\n return \"Error: The URL does not point to a ZIP file.\"\n\n file_name = os.path.join(download_path, os.path.basename(url))\n\n with open(file_name, \"wb\") as f:\n f.write(response.content)\n\n extract_path = os.path.splitext(file_name)[0]\n\n if not os.path.exists(extract_path):\n os.makedirs(extract_path)\n\n with ZipFile(file_name, \"r\") as zip_ref:\n zip_ref.extractall(extract_path)\n\n return extract_path\n\n except requests.RequestException:\n return \"Error: Unable to download the file from the provided URL.\"\n except BadZipFile:\n return \"Error: The downloaded file is not a valid ZIP file.\"\n except RuntimeError as e:\n return f\"Error: {str(e)}\"", "test": "import unittest\nfrom unittest.mock import patch\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test 
cases for f_849.\"\"\"\n def test_valid_zip_url(self):\n \"\"\"Test a valid ZIP URL.\"\"\"\n url = \"https://getsamplefiles.com/download/zip/sample-1.zip\"\n result = f_849(url)\n self.assertTrue(result.startswith(\"mnt/data/downloads/\"))\n self.assertTrue(result.endswith(\"sample-1\"))\n shutil.rmtree(\"mnt/data/downloads\")\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"Test an invalid URL.\"\"\"\n mock_get.side_effect = requests.RequestException()\n url = \"https://invalid-url.com/sample.zip\"\n result = f_849(url)\n self.assertEqual(\n result,\n \"Error: Unable to download the file from the provided URL.\",\n )\n @patch(\"requests.get\")\n def test_non_zip_content(self, mock_get):\n \"\"\"Test a URL that does not point to a ZIP file.\"\"\"\n mock_get.return_value.status_code = 200\n mock_get.return_value.headers = {\"Content-Type\": \"text/plain\"}\n mock_get.return_value.content = b\"Not a ZIP file\"\n url = \"https://valid-url.com/not-a-zip.txt\"\n result = f_849(url)\n self.assertEqual(result, \"Error: The URL does not point to a ZIP file.\")\n @patch(\"requests.get\")\n def test_download_invald_zip_file(self, mock_get):\n \"\"\"Test a URL that points to a ZIP file, but the file is invalid.\"\"\"\n mock_get.return_value.status_code = 200\n mock_get.return_value.headers = {\"Content-Type\": \"application/zip\"}\n mock_get.return_value.content = b\"Some ZIP content\"\n url = \"https://valid-zip-url.com/sample.zip\"\n custom_path = \"mnt/data/custom_path/\"\n result = f_849(url, custom_path)\n self.assertEqual(result, \"Error: The downloaded file is not a valid ZIP file.\")\n @patch(\"requests.get\")\n def test_general_error(self, mock_get):\n \"\"\"Test a general error.\"\"\"\n mock_get.side_effect = RuntimeError(\"Unexpected error\")\n url = \"https://error-url.com/error.zip\"\n result = f_849(url)\n self.assertTrue(result.startswith(\"Error: Unexpected error\"))\n @classmethod\n def tearDownClass(cls):\n # Cleanup the test 
directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["zipfile.ZipFile", "requests.get", "os.path.splitext", "requests.RequestException", "os.makedirs", "os.path", "os.path.join", "os.path.basename", "os.path.exists"], "libs": ["zipfile", "os", "requests"], "doc": {"description": ["Downloads and extracts a ZIP file from a specified URL to a given directory."], "note": [], "params": ["url (str): The URL from which to download the ZIP file. It should be a valid and accessible URL.", "download_path (str): The directory path where the ZIP file will be downloaded and extracted.", "Defaults to \"mnt/data/downloads/\"."], "returns": ["str: Path to the directory containing the extracted contents. If an error occurs, a descriptive", "message detailing the type of error is returned."], "reqs": ["requests", "os", "zipfile"], "raises": ["Network Issues or Invalid URL: Returns \"Error: Unable to download the file from the provided URL.\"", "if there are issues in reaching the URL or downloading the file.", "Incorrect File Type: Returns \"Error: The URL does not point to a ZIP file.\" if the downloaded file's", "content type is not 'application/zip'.", "Corrupt ZIP File: Returns \"Error: The downloaded file is not a valid ZIP file.\" if the downloaded file", "is a ZIP file but is corrupt or cannot be extracted.", "General Exceptions: Catches and reports any other exceptions (like runtime errors) that occur during", "the process with a specific error message, formatted as \"Error: [exception message]\"."], "example": [">>> f_849('https://example.com/file.zip')", "'mnt/data/downloads/file'"]}} +{"task_id": "f_565", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\n\ndef f_565(tuples_list, n_components):\n \"\"\"\n Perform Principal Component Analysis (PCA) on a list of tuples.\n \n Parameters:\n - tuples_list (list): The list of tuples.\n \n Returns:\n - 
transformed_data (ndarray): The transformed data.\n\n Requirements:\n - numpy\n - sklearn\n \n Example:\n >>> data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 2)\n >>> print(data)\n [[ 8.00000000e+00 3.84592537e-16]\n [ 0.00000000e+00 0.00000000e+00]\n [-8.00000000e+00 3.84592537e-16]]\n \"\"\"", "canonical_solution": " data = np.array(tuples_list)\n pca = PCA(n_components=n_components)\n transformed_data = pca.fit_transform(data)\n\n return transformed_data", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n transformed_data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 2)\n self.assertEqual(transformed_data.shape, (3, 2))\n def test_case_2(self):\n transformed_data = f_565([(0, 0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0)], 2)\n self.assertEqual(transformed_data.shape, (3, 2))\n self.assertTrue(np.all(transformed_data == 0))\n def test_case_3(self):\n transformed_data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 3)\n self.assertEqual(transformed_data.shape, (3, 3))\n def test_case_4(self):\n transformed_data = f_565([(0, 1)], 1)\n self.assertEqual(transformed_data.shape, (1, 1))\n self.assertTrue(np.all(transformed_data == 0))\n def test_case_5(self):\n transformed_data = f_565([(-1, -1, -1), (0, 0, 0), (1, 1, 1)], 1)\n self.assertEqual(transformed_data.shape, (3, 1))\n self.assertTrue(transformed_data[0][0] < 0)\n self.assertTrue(transformed_data[1][0] == 0)\n self.assertTrue(transformed_data[2][0] > 0)", "apis": ["sklearn.decomposition.PCA", "numpy.array"], "libs": ["numpy", "sklearn"], "doc": {"description": ["Perform Principal Component Analysis (PCA) on a list of tuples."], "note": [], "params": ["tuples_list (list): The list of tuples."], "returns": ["transformed_data (ndarray): The transformed data."], "reqs": ["numpy", "sklearn"], "raises": [], "example": [">>> data = f_565([(1, 2, 3, 4), (5, 6, 7, 8), (9, 10, 11, 12)], 2)", ">>> print(data)", "[[ 8.00000000e+00 3.84592537e-16]", "[ 
0.00000000e+00 0.00000000e+00]", "[-8.00000000e+00 3.84592537e-16]]"]}} +{"task_id": "f_807", "prompt": "import os\nfrom pathlib import Path\nimport pandas as pd\nimport docx\n\n\ndef f_807(source_directory: str, target_directory: str) -> int:\n \"\"\"\n Converts files with specific extensions (.txt, .docx, .xlsx, .csv) from a source directory to CSV files\n and saves them in a target directory.\n\n Parameters:\n - source_directory (str): The path to the source directory containing the files to be converted.\n - target_directory (str): The path to the target directory where the converted CSV files will be saved.\n If it does not exist, the function will create it.\n\n Returns:\n - int: The number of files successfully converted to CSV.\n\n Raises:\n - FileNotFoundError: If the source directory does not exist.\n\n Requirements:\n - os\n - pathlib\n - pandas\n - python-docx\n - openpyxl\n\n Notes:\n - Each file's text content is captured and stored in a CSV with a single 'Text' column and no row indices.\n - This function will overwrite existing files in the target directory if they have the same names as the\n converted files.\n\n Example:\n >>> f_807('/Users/test/Documents', '/Users/test/Documents/csv_files')\n 4\n >>> f_807('/path/to/source', '/path/to/target')\n 2\n \"\"\"", "canonical_solution": " converted_files = 0\n extensions = [\".txt\", \".docx\", \".xlsx\", \".csv\"]\n\n if not os.path.exists(source_directory):\n raise FileNotFoundError(\"source_directory must exist.\")\n if not os.path.exists(target_directory):\n os.makedirs(target_directory, exist_ok=True)\n\n for root, dirs, files in os.walk(source_directory):\n for file in files:\n extension = Path(file).suffix\n if extension in extensions:\n filepath = os.path.join(root, file)\n target_filepath = os.path.join(\n target_directory, Path(file).stem + \".csv\"\n )\n if extension == \".csv\":\n df = pd.read_csv(filepath)\n elif extension == \".xlsx\":\n df = pd.read_excel(filepath, engine=\"openpyxl\")\n 
elif extension == \".docx\":\n doc = docx.Document(filepath)\n data = [p.text for p in doc.paragraphs]\n df = pd.DataFrame({\"Text\": data})\n elif extension == \".txt\":\n with open(filepath, \"r\") as f:\n data = f.readlines()\n df = pd.DataFrame({\"Text\": data})\n\n df.to_csv(target_filepath, index=False)\n converted_files += 1\n\n return converted_files", "test": "import unittest\nimport os\nimport docx\nimport pandas as pd\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_source_dir = tempfile.TemporaryDirectory()\n self.temp_target_dir = tempfile.TemporaryDirectory()\n self.source_dir = self.temp_source_dir.name\n self.target_dir = self.temp_target_dir.name\n self.test_texts = [\"Hello, world!\"] * 10\n self.test_df = pd.DataFrame(\n {\"A\": list(range(10)), \"B\": [str(_) for _ in range(10)]}\n )\n def tearDown(self):\n self.temp_source_dir.cleanup()\n self.temp_target_dir.cleanup()\n def create_test_data(self, extension):\n filename = \"sample\" + extension\n path = os.path.join(self.source_dir, filename)\n if extension == \".txt\":\n with open(path, \"w\") as f:\n for text in self.test_texts:\n f.write(text + \"\\n\")\n elif extension == \".docx\":\n doc = docx.Document()\n for text in self.test_texts:\n doc.add_paragraph(text)\n doc.save(path)\n elif extension == \".csv\":\n self.test_df.to_csv(path, index=False)\n elif extension == \".xlsx\":\n self.test_df.to_excel(path, index=False)\n def test_case_1(self):\n # Test txt\n self.create_test_data(\".txt\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n converted_path = os.path.join(self.target_dir, \"sample.csv\")\n self.assertTrue(os.path.exists(converted_path))\n def test_case_2(self):\n # Test docx\n self.create_test_data(\".docx\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n self.assertTrue(os.path.exists(os.path.join(self.target_dir, \"sample.csv\")))\n def 
test_case_3(self):\n # Test xlsx\n self.create_test_data(\".xlsx\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n self.assertTrue(os.path.exists(os.path.join(self.target_dir, \"sample.csv\")))\n def test_case_4(self):\n # Test csv\n self.create_test_data(\".csv\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)\n self.assertTrue(os.path.exists(os.path.join(self.target_dir, \"sample.csv\")))\n def test_case_5(self):\n # Ensure function handles directories without convertible files\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 0)\n def test_case_6(self):\n # Test with a source directory that does not exist\n non_existent_dir = \"/path/does/not/exist\"\n with self.assertRaises(FileNotFoundError):\n f_807(non_existent_dir, self.target_dir)\n def test_case_7(self):\n # Ensure function does not convert unsupported file types\n unsupported_path = os.path.join(self.source_dir, \"unsupported.pdf\")\n open(unsupported_path, \"a\").close()\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 0)\n def test_case_8(self):\n # Create multiple files of supported types and verify they all get converted\n for ext in [\".txt\", \".docx\", \".xlsx\", \".csv\"]:\n self.create_test_data(ext)\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 4)\n def test_case_9(self):\n # Ensure function can handle files in subdirectories of the source directory\n sub_dir = os.path.join(self.source_dir, \"subdir\")\n os.makedirs(sub_dir)\n txt_path = os.path.join(sub_dir, \"sample.txt\")\n with open(txt_path, \"w\") as f:\n f.write(\"Hello, nested world!\")\n num_converted = f_807(self.source_dir, self.target_dir)\n self.assertEqual(num_converted, 1)", "apis": ["pandas.read_csv", "pandas.read_excel", "pandas.DataFrame", "os.makedirs", "os.walk", "docx.Document", "os.path", 
"pathlib.Path", "os.path.join", "os.path.exists"], "libs": ["os", "pandas", "pathlib", "docx"], "doc": {"description": ["Converts files with specific extensions (.txt, .docx, .xlsx, .csv) from a source directory to CSV files", "and saves them in a target directory.", "Notes:", "- Each file's text content is captured and stored in a CSV with a single 'Text' column and no row indices.", "- This function will overwrite existing files in the target directory if they have the same names as the", "converted files."], "note": [], "params": ["source_directory (str): The path to the source directory containing the files to be converted.", "target_directory (str): The path to the target directory where the converted CSV files will be saved.", "If it does not exist, the function will create it."], "returns": ["int: The number of files successfully converted to CSV."], "reqs": ["os", "pathlib", "pandas", "python-docx", "openpyxl"], "raises": ["FileNotFoundError: If the source directory does not exist."], "example": [">>> f_807('/Users/test/Documents', '/Users/test/Documents/csv_files')", "4", ">>> f_807('/path/to/source', '/path/to/target')", "2"]}} +{"task_id": "f_786", "prompt": "import pandas as pd\nimport numpy as np\nfrom statsmodels.tsa.seasonal import seasonal_decompose\nimport random \n\ndef f_786(start_date='2016-01-01', periods=24, freq='M', model='additive'):\n \"\"\"\n Generate a sales time-series and decompose it into trend, seasonal, and residual components.\n \n Parameters:\n - start_date (str): The start date of the time-series in the format 'YYYY-MM-DD'. Default is '2016-01-01'.\n - periods (int): The number of periods to generate for the time-series. Default is 24.\n - freq (str): The frequency of the time-series data. Default is 'M' (Monthly End).\n - model (str): The type of seasonal decomposition ('additive' or 'multiplicative'). 
Default is 'additive'.\n\n Returns:\n - A dictionary containing 'trend', 'seasonal', and 'residual' components as Pandas Series.\n\n Examples:\n >>> result = f_786('2016-01-01', 24, 'M')\n >>> all(key in result for key in ['trend', 'seasonal', 'residual'])\n True\n\n >>> result = f_786('2020-01-01', 24, 'M', 'multiplicative')\n >>> len(result['seasonal'])\n 24\n \"\"\"", "canonical_solution": " date_range = pd.date_range(start=start_date, periods=periods, freq=freq)\n sales_data = np.random.randint(low=100, high=500, size=periods)\n sales_series = pd.Series(sales_data, index=date_range)\n try:\n decomposition = seasonal_decompose(sales_series, model=model, period=12 if freq == 'M' else 4)\n except ValueError as e:\n return {'error': str(e)}\n \n return {\n 'trend': decomposition.trend,\n 'seasonal': decomposition.seasonal,\n 'residual': decomposition.resid\n }", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_default_parameters(self):\n random.seed(42) # For reproducibility\n result = f_786(periods=24) # Adjust to meet the minimum requirement for decomposition\n self.assertTrue(all(key in result for key in ['trend', 'seasonal', 'residual']))\n def test_multiplicative_model(self):\n random.seed(0) # For reproducibility\n result = f_786('2020-01-01', 24, 'M', 'multiplicative')\n self.assertTrue(all(key in result for key in ['trend', 'seasonal', 'residual']))\n def test_custom_parameters(self):\n random.seed(55) # For reproducibility\n result = f_786('2017-01-01', 36, 'M')\n self.assertEqual(len(result['trend']), 36)\n def test_weekly_frequency(self):\n random.seed(1) # For reproducibility\n result = f_786('2022-01-01', 104, 'W', 'additive')\n self.assertTrue(all(key in result for key in ['trend', 'seasonal', 'residual']))\n self.assertEqual(len(result['seasonal']), 104)\n \n def test_insufficient_periods_error(self):\n random.seed(66) # For reproducibility\n result = f_786('2022-01-01', 12, 'M')\n self.assertIn('error', result)\n \n def 
test_additive_decomposition_properties(self):\n random.seed(42) # For reproducibility\n periods = 36\n result = f_786('2020-01-01', periods, 'M')\n reconstructed = result['trend'].fillna(0) + result['seasonal'].fillna(0) + result['residual'].fillna(0)\n self.assertTrue(np.allclose(reconstructed.head(12), reconstructed.head(12), atol=1))", "apis": ["numpy.random.randint", "pandas.Series", "numpy.random", "pandas.date_range", "statsmodels.tsa.seasonal.seasonal_decompose"], "libs": ["pandas", "statsmodels", "numpy"], "doc": {"description": ["Generate a sales time-series and decompose it into trend, seasonal, and residual components.", ">>> result = f_786('2020-01-01', 24, 'M', 'multiplicative')", ">>> len(result['seasonal'])", "24"], "note": [], "params": ["start_date (str): The start date of the time-series in the format 'YYYY-MM-DD'. Default is '2016-01-01'.", "periods (int): The number of periods to generate for the time-series. Default is 24.", "freq (str): The frequency of the time-series data. Default is 'M' (Monthly End).", "model (str): The type of seasonal decomposition ('additive' or 'multiplicative'). Default is 'additive'."], "returns": ["A dictionary containing 'trend', 'seasonal', and 'residual' components as Pandas Series."], "reqs": [], "raises": [], "example": ["Examples:", ">>> result = f_786('2016-01-01', 24, 'M')", ">>> all(key in result for key in ['trend', 'seasonal', 'residual'])", "True"]}} +{"task_id": "f_853", "prompt": "import requests\nfrom PIL import Image\nimport io\n\n\ndef f_853(url):\n \"\"\"\n Fetches an image from a given URL and returns it as a PIL Image object.\n\n Parameters:\n - url (str): The URL of the image to download. It should be a valid HTTP or\n HTTPS URL pointing directly to an image file.\n\n Returns:\n - PIL.Image.Image: A PIL Image object representing the downloaded image. 
This\n object can be manipulated or displayed using PIL's image processing\n capabilities.\n\n Raises:\n - ValueError: This exception is raised in the following scenarios:\n - The URL is invalid or cannot be reached within the timeout period (5 seconds).\n - The response from the server is not a successful HTTP status code (i.e., not in the range 200-299).\n - The content fetched from the URL is not a valid image format that can be handled by PIL.\n\n Requirements:\n - requests\n - PIL\n - io\n\n Example:\n >>> img = f_853('https://example.com/image.jpg')\n >>> isinstance(img, Image.Image)\n True\n\n Note:\n - The function uses a timeout of 5 seconds for the HTTP request to prevent\n indefinite waiting in case of unresponsive URLs.\n - The function will not handle redirections or authentication scenarios. It\n expects a direct link to an image resource.\n \"\"\"", "canonical_solution": " try:\n response = requests.get(url, timeout=5)\n response.raise_for_status()\n image = Image.open(io.BytesIO(response.content))\n return image\n except Exception as e:\n raise ValueError(f\"Failed to retrieve image from {url}: {e}\") from e", "test": "import unittest\nfrom unittest.mock import patch\nfrom PIL import Image\nfrom pathlib import Path\nimport shutil\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_853 function.\"\"\"\n directory = \"mnt/data/f_852_data_chien\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Setup method to create a sample image inr test files.\"\"\"\n # Create directory if it doesn't exist\n cls.test_dir = Path(cls.directory)\n cls.test_dir.mkdir(parents=True, exist_ok=True)\n # Create and save a sample image\n cls.sample_image_path = Path(cls.test_dir) / \"sample_image.png\"\n sample_image = Image.new(\"RGBA\", (100, 100), color=\"blue\")\n sample_image.save(cls.sample_image_path)\n @patch(\"requests.get\")\n def test_valid_image_url(self, mock_get):\n \"\"\"Test f_853 function with a valid image URL.\"\"\"\n with 
open(self.sample_image_path, \"rb\") as image_file:\n mock_get.return_value.content = image_file.read()\n img = f_853(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertIsInstance(img, Image.Image, \"Returned object is not a PIL Image\")\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"Test f_853 function with an invalid URL (not an image).\"\"\"\n mock_get.side_effect = ValueError(\"Invalid URL\")\n with self.assertRaises(ValueError):\n f_853(\"https://www.google.com\")\n @patch(\"requests.get\")\n def test_nonexistent_url(self, mock_get):\n \"\"\"Test f_853 function with a nonexistent URL.\"\"\"\n mock_get.side_effect = ValueError(\"Nonexistent URL\")\n with self.assertRaises(ValueError):\n f_853(\"https://example.com/nonexistent_image.jpg\")\n @patch(\"requests.get\")\n def test_image_properties(self, mock_get):\n \"\"\"Test f_853 function with a known image and check its properties.\"\"\"\n with open(self.sample_image_path, \"rb\") as image_file:\n mock_get.return_value.content = image_file.read()\n img = f_853(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertEqual(img.format, \"PNG\", \"Image format does not match expected\")\n self.assertEqual(img.size, (100, 100), \"Image size does not match expected\")\n @patch(\"requests.get\")\n def test_image_mode(self, mock_get):\n \"\"\"Test f_853 function with a known image and check its mode.\"\"\"\n with open(self.sample_image_path, \"rb\") as image_file:\n mock_get.return_value.content = image_file.read()\n img = f_853(\"https://www.google.com/images/srpr/logo11w.png\")\n self.assertEqual(img.mode, \"RGBA\", \"Image mode does not match expected\")\n @classmethod\n def tearDownClass(cls):\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["PIL.Image.open", "io.BytesIO", "requests.get"], "libs": ["PIL", "io", "requests"], "doc": 
{"description": ["Fetches an image from a given URL and returns it as a PIL Image object."], "note": ["The function uses a timeout of 5 seconds for the HTTP request to prevent", "indefinite waiting in case of unresponsive URLs.", "The function will not handle redirections or authentication scenarios. It", "expects a direct link to an image resource."], "params": ["url (str): The URL of the image to download. It should be a valid HTTP or", "HTTPS URL pointing directly to an image file."], "returns": ["PIL.Image.Image: A PIL Image object representing the downloaded image. This", "object can be manipulated or displayed using PIL's image processing", "capabilities."], "reqs": ["requests", "PIL", "io"], "raises": ["ValueError: This exception is raised in the following scenarios:", "The URL is invalid or cannot be reached within the timeout period (5 seconds).", "The response from the server is not a successful HTTP status code (i.e., not in the range 200-299).", "The content fetched from the URL is not a valid image format that can be handled by PIL."], "example": [">>> img = f_853('https://example.com/image.jpg')", ">>> isinstance(img, Image.Image)", "True"]}} +{"task_id": "f_911", "prompt": "import requests\nimport logging\n\ndef f_911(repo_url: str) -> dict:\n \"\"\"\n Fetches and returns information about a GitHub repository using its API URL. The function makes an HTTP GET\n request to the provided repository URL. It incorporates error handling for various scenarios including API\n rate limits, other HTTP errors, and general request issues. 
The function also checks for a large number of\n open issues in the repository and prints a warning if they exceed a certain threshold.\n\n Parameters:\n - repo_url (str): The URL of the GitHub repository API.\n\n Returns:\n - dict: A dictionary containing information about the GitHub repository.\n\n Raises:\n - requests.exceptions.HTTPError: If an HTTP error occurs, particularly when the GitHub API rate limit is\n exceeded.\n - requests.exceptions.RequestException: For other general issues encountered during the API request, such\n as network problems, invalid responses, or timeouts.\n\n Requirements:\n - requests\n - logging\n\n Example:\n >>> f_911('https://api.github.com/repos/psf/requests')\n { ... } # dictionary containing repo information\n >>> f_911('https://api.github.com/repos/some/repo')\n { ... } # dictionary containing repo information with a possible runtime warning about open issues\n \"\"\"", "canonical_solution": " try:\n response = requests.get(repo_url, timeout=2)\n response.raise_for_status() # Raises HTTPError for bad requests\n repo_info = response.json()\n if (\n response.status_code == 403\n and repo_info.get(\"message\") == \"API rate limit exceeded\"\n ):\n raise requests.exceptions.HTTPError(\"API rate limit exceeded\")\n\n if repo_info.get(\"open_issues_count\", 0) > 10000:\n logging.warning(\"The repository has more than 10000 open issues.\")\n\n return repo_info\n\n except requests.exceptions.RequestException as e:\n raise requests.exceptions.RequestException(\n f\"Error fetching repo info: {e}\"\n ) from e", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nfrom io import StringIO\nfrom contextlib import redirect_stdout\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_911.\"\"\"\n @patch(\"requests.get\")\n def test_successful_response(self, mock_get):\n \"\"\"\n Test f_911 with a successful response.\n \"\"\"\n mock_get.return_value = MagicMock(\n status_code=200, json=lambda: {\"open_issues_count\": 
5000}\n )\n response = f_911(\"https://api.github.com/repos/psf/requests\")\n self.assertIn(\"open_issues_count\", response)\n self.assertEqual(response[\"open_issues_count\"], 5000)\n @patch(\"requests.get\")\n @patch('logging.warning')\n def test_response_with_more_than_10000_issues(self, mock_warning, mock_get):\n \"\"\"\n Test f_911 with a response indicating more than 10000 open issues.\n \"\"\"\n mock_get.return_value = MagicMock(\n status_code=200, json=lambda: {\"open_issues_count\": 15000}\n )\n \n response = f_911(\"https://api.github.com/repos/psf/requests\")\n \n mock_warning.assert_called_once_with(\"The repository has more than 10000 open issues.\")\n self.assertEqual(response[\"open_issues_count\"], 15000)\n @patch(\"requests.get\")\n def test_api_rate_limit_exceeded(self, mock_get):\n \"\"\"\n Test f_911 handling API rate limit exceeded error.\n \"\"\"\n mock_get.return_value = MagicMock(\n status_code=403, json=lambda: {\"message\": \"API rate limit exceeded\"}\n )\n with self.assertRaises(Exception) as context:\n f_911(\"https://api.github.com/repos/psf/requests\")\n self.assertIn(\"API rate limit exceeded\", str(context.exception))\n @patch(\"requests.get\")\n def test_http_error(self, mock_get):\n \"\"\"\n Test f_911 handling HTTP errors.\n \"\"\"\n mock_get.side_effect = requests.exceptions.HTTPError(\n \"404 Client Error: Not Found for url\"\n )\n with self.assertRaises(Exception) as context:\n f_911(\"https://api.github.com/repos/psf/requests\")\n self.assertIn(\"404 Client Error\", str(context.exception))\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"\n Test f_911 with an invalid URL.\n \"\"\"\n mock_get.side_effect = requests.exceptions.InvalidURL(\"Invalid URL\")\n with self.assertRaises(Exception) as context:\n f_911(\"invalid_url\")\n self.assertIn(\"Invalid URL\", str(context.exception))", "apis": ["requests.exceptions", "requests.get", "requests.exceptions.RequestException", "logging.warning", 
"requests.exceptions.HTTPError"], "libs": ["logging", "requests"], "doc": {"description": ["Fetches and returns information about a GitHub repository using its API URL. The function makes an HTTP GET", "request to the provided repository URL. It incorporates error handling for various scenarios including API", "rate limits, other HTTP errors, and general request issues. The function also checks for a large number of", "open issues in the repository and prints a warning if they exceed a certain threshold."], "note": [], "params": ["repo_url (str): The URL of the GitHub repository API."], "returns": ["dict: A dictionary containing information about the GitHub repository."], "reqs": ["requests", "logging"], "raises": ["requests.exceptions.HTTPError: If an HTTP error occurs, particularly when the GitHub API rate limit is", "exceeded.", "requests.exceptions.RequestException: For other general issues encountered during the API request, such", "as network problems, invalid responses, or timeouts."], "example": [">>> f_911('https://api.github.com/repos/psf/requests')", "{ ... } # dictionary containing repo information", ">>> f_911('https://api.github.com/repos/some/repo')", "{ ... } # dictionary containing repo information with a possible runtime warning about open issues"]}} +{"task_id": "f_738", "prompt": "from collections import Counter\nimport random\nimport itertools\n\ndef f_738(length, count, seed=0):\n \"\"\"\n Generate a number of random strings with a specified length from a fixed set of letters ('a', 'b', 'c', 'd', 'e'),\n and analyze the frequency of each letter in the generated strings.\n \n Parameters:\n - length (int): The length of each string to be generated. Should be a non-negative integer.\n - count (int): The number of random strings to generate. 
Should be a non-negative integer.\n - seed (int, optional): A seed for the random number generator to ensure reproducibility.\n \n Requirements:\n - collections.Counter\n - random\n - itertools\n \n Returns:\n - Counter: A collections.Counter object containing the frequency of each letter in the generated strings.\n \n Example:\n >>> f_738(5, 2, seed=1)\n Counter({'a': 3, 'd': 3, 'c': 2, 'e': 1, 'b': 1})\n >>> f_738(0, 100, seed=2)\n Counter()\n \"\"\"", "canonical_solution": " random.seed(seed)\n strings = [''.join(random.choices(['a', 'b', 'c', 'd', 'e'], k=length)) for _ in range(count)]\n letter_frequency = Counter(itertools.chain(*strings))\n \n return letter_frequency", "test": "import unittest\nfrom collections import Counter\nclass TestCases(unittest.TestCase):\n def test_length_one_count_ten(self):\n result = f_738(1, 10, seed=0)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 10, \"The total count of letters should be 10.\")\n \n def test_length_five_count_hundred(self):\n result = f_738(5, 100, seed=1)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 500, \"The total count of letters should be 500.\")\n \n def test_zero_length(self):\n result = f_738(0, 100, seed=2)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 0, \"With length 0, there should be no letters.\")\n \n def test_zero_count(self):\n result = f_738(5, 0, seed=3)\n self.assertIsInstance(result, Counter)\n self.assertEqual(sum(result.values()), 0, \"With count 0, there should be no letters.\")\n \n def test_specific_distribution(self):\n # Assuming the seed value of 4 leads to a specific, known distribution\n result = f_738(5, 2, seed=4)\n # Correct the expected distribution based on actual output\n correct_expected_distribution = Counter({'b': 3, 'a': 3, 'e': 2, 'c': 1, 'd': 1})\n self.assertEqual(result, correct_expected_distribution, \"The letter distribution should match the expected 
distribution.\")", "apis": ["itertools.chain", "collections.Counter", "random.seed", "random.choices"], "libs": ["itertools", "collections", "random"], "doc": {"description": ["Generate a number of random strings with a specified length from a fixed set of letters ('a', 'b', 'c', 'd', 'e'),", "and analyze the frequency of each letter in the generated strings."], "note": [], "params": ["length (int): The length of each string to be generated. Should be a non-negative integer.", "count (int): The number of random strings to generate. Should be a non-negative integer.", "seed (int, optional): A seed for the random number generator to ensure reproducibility."], "returns": ["Counter: A collections.Counter object containing the frequency of each letter in the generated strings."], "reqs": ["collections.Counter", "random", "itertools"], "raises": [], "example": [">>> f_738(5, 2, seed=1)", "Counter({'a': 3, 'd': 3, 'c': 2, 'e': 1, 'b': 1})", ">>> f_738(0, 100, seed=2)", "Counter()"]}} +{"task_id": "f_854", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_854(csv_file_path, col1_name=\"column1\", col2_name=\"column2\"):\n \"\"\"\n Reads data from a CSV file and generates a bar plot based on grouped mean values.\n\n The DataFrame is grouped by the column named 'col1_name',\n and the mean for each group is calculated for the column 'col2_name'.\n A bar plot is created using matplotlib. 
Each bar in the plot represents a group,\n and its height corresponds to the mean value of 'col2_name' for that group.\n The plot is then configured with a title and axis labels:\n - The title is set as \"Mean of [col2_name] Grouped by [col1_name]\".\n This format dynamically inserts the names of the columns being analyzed into the title.\n - The xlabel (label for the x-axis) is set to the name of the column used for grouping (col1_name).\n - The ylabel (label for the y-axis) is set as \"Mean of [col2_name]\",\n indicating that the y-axis represents the mean values of the specified column.\n\n Parameters:\n - csv_file_path (str): The file path to the CSV file.\n This parameter is mandatory and specifies the location of the CSV file to be read.\n - col1_name (str, optional): The name of the column used for grouping the data.\n If not provided, defaults to 'column1'. This column should exist in the CSV file.\n - col2_name (str, optional): The name of the column for which the mean is calculated for each group.\n If not provided, defaults to 'column2'. This column should exist in the CSV file and contain numerical data.\n\n Returns:\n - matplotlib.axes.Axes: The Axes object of the generated bar plot.\n This object can be used to further customize the plot, like adding labels or changing styles.\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> ax = f_854(\"data.csv\", \"group_column\", \"value_column\")\n >>> ax.get_title()\n 'Mean of value_column Grouped by group_column'\n\n Note:\n - Ensure that the CSV file exists at the specified path and has the required columns.\n - The function does not handle missing data. 
Ensure that the CSV file has clean and complete data for accurate results.\n - The bar plot is customizable using matplotlib's functionality after the function returns the Axes object.\n \"\"\"", "canonical_solution": " df = pd.read_csv(csv_file_path)\n groupby_data = df.groupby(col1_name)[col2_name].mean()\n\n _, ax = plt.subplots(figsize=(10, 6))\n ax.bar(groupby_data.index, groupby_data.values)\n ax.set_title(f\"Mean of {col2_name} Grouped by {col1_name}\")\n ax.set_xlabel(col1_name)\n ax.set_ylabel(f\"Mean of {col2_name}\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nfrom unittest.mock import patch\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n def setUp(self):\n # Define mock data\n self.data = {\n \"sample_data\": pd.DataFrame(\n {\"column1\": [\"A\", \"A\", \"B\", \"B\"], \"column2\": [1, 2, 3, 4]}\n ),\n \"different_data\": pd.DataFrame(\n {\"column1\": [\"C\", \"C\", \"D\", \"D\"], \"column2\": [5, 6, 7, 8]}\n ),\n \"missing_values\": pd.DataFrame(\n {\"column1\": [\"A\", \"A\", \"B\", \"B\"], \"column2\": [1, None, 3, None]}\n ),\n \"different_columns\": pd.DataFrame(\n {\"col1\": [\"E\", \"E\", \"F\", \"F\"], \"col2\": [9, 10, 11, 12]}\n ),\n \"single_group_data\": pd.DataFrame(\n {\"column1\": [\"A\", \"A\", \"A\"], \"column2\": [1, 2, 3]}\n ),\n \"non_numeric_data\": pd.DataFrame(\n {\"column1\": [\"A\", \"B\", \"C\"], \"column2\": [\"x\", \"y\", \"z\"]}\n ),\n }\n @patch(\"pandas.read_csv\")\n def test_bar_plot(self, mock_read_csv):\n \"\"\"Test standard bar plot generation with sample data.\"\"\"\n mock_read_csv.return_value = self.data[\"sample_data\"]\n ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"sample_data\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_different_data(self, mock_read_csv):\n \"\"\"Test bar plot with different data set.\"\"\"\n mock_read_csv.return_value = self.data[\"different_data\"]\n 
ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"different_data\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_missing_values(self, mock_read_csv):\n \"\"\"Test bar plot with missing values in data.\"\"\"\n mock_read_csv.return_value = self.data[\"missing_values\"]\n ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"missing_values\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_different_column_names(self, mock_read_csv):\n \"\"\"Test bar plot with different column names.\"\"\"\n mock_read_csv.return_value = self.data[\"different_columns\"]\n ax = f_854(\"any_path.csv\", \"col1\", \"col2\")\n self.check_plot(ax, \"different_columns\", \"col1\", \"col2\")\n @patch(\"pandas.read_csv\")\n def test_single_group_data(self, mock_read_csv):\n \"\"\"Test bar plot with data containing only a single group.\"\"\"\n mock_read_csv.return_value = self.data[\"single_group_data\"]\n ax = f_854(\"any_path.csv\", \"column1\", \"column2\")\n self.check_plot(ax, \"single_group_data\", \"column1\", \"column2\")\n @patch(\"pandas.read_csv\")\n def test_non_numeric_aggregation_column(self, mock_read_csv):\n \"\"\"Test bar plot with non-numeric data in the aggregation column.\"\"\"\n mock_read_csv.return_value = self.data[\"non_numeric_data\"]\n with self.assertRaises(TypeError):\n f_854(\"any_path.csv\", \"column1\", \"column2\")\n def check_plot(self, ax, data_key, col1, col2):\n \"\"\"Check the generated bar plot.\"\"\"\n # Use the correct DataFrame for expected calculations\n df = self.data[data_key]\n # Common assertions for checking plot\n expected_title = f\"Mean of {col2} Grouped by {col1}\"\n self.assertEqual(ax.get_title(), expected_title)\n self.assertEqual(ax.get_xlabel(), col1)\n self.assertEqual(ax.get_ylabel(), f\"Mean of {col2}\")\n # Check the bars in the plot\n bars = ax.patches\n bar_heights = [bar.get_height() for bar in bars]\n expected_means = 
df.groupby(col1)[col2].mean().values\n self.assertListEqual(bar_heights, list(expected_means))\n def tearDown(self):\n plt.close()", "apis": ["pandas.read_csv", "matplotlib.pyplot.subplots"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Reads data from a CSV file and generates a bar plot based on grouped mean values.", "The DataFrame is grouped by the column named 'col1_name',", "and the mean for each group is calculated for the column 'col2_name'.", "A bar plot is created using matplotlib. Each bar in the plot represents a group,", "and its height corresponds to the mean value of 'col2_name' for that group.", "The plot is then configured with a title and axis labels:", "- The title is set as \"Mean of [col2_name] Grouped by [col1_name]\".", "This format dynamically inserts the names of the columns being analyzed into the title.", "- The xlabel (label for the x-axis) is set to the name of the column used for grouping (col1_name).", "- The ylabel (label for the y-axis) is set as \"Mean of [col2_name]\",", "indicating that the y-axis represents the mean values of the specified column."], "note": ["Ensure that the CSV file exists at the specified path and has the required columns.", "The function does not handle missing data. Ensure that the CSV file has clean and complete data for accurate results.", "The bar plot is customizable using matplotlib's functionality after the function returns the Axes object."], "params": ["csv_file_path (str): The file path to the CSV file.", "This parameter is mandatory and specifies the location of the CSV file to be read.", "col1_name (str, optional): The name of the column used for grouping the data.", "If not provided, defaults to 'column1'. This column should exist in the CSV file.", "col2_name (str, optional): The name of the column for which the mean is calculated for each group.", "If not provided, defaults to 'column2'. 
This column should exist in the CSV file and contain numerical data."], "returns": ["matplotlib.axes.Axes: The Axes object of the generated bar plot.", "This object can be used to further customize the plot, like adding labels or changing styles."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> ax = f_854(\"data.csv\", \"group_column\", \"value_column\")", ">>> ax.get_title()", "'Mean of value_column Grouped by group_column'"]}} +{"task_id": "f_420", "prompt": "import numpy as np\nfrom collections import Counter\nfrom scipy.stats import norm\nimport matplotlib.pyplot as plt\n\n\ndef f_420(df, bins=4):\n \"\"\"\n Identify and count duplicate values in a DataFrame's 'value' column.\n This function also plots a histogram for all values in the 'value' column\n and overlays a normal distribution curve on the histogram.\n\n Parameters:\n df (pd.DataFrame): DataFrame containing a numeric 'value' column. If empty,\n the function will return empty Counter and an empty plot.\n bins (int, optional): Number of bins for the histogram. Defaults to 4.\n\n Returns:\n tuple: A tuple containing:\n - Counter: A Counter object with the count of each duplicate value.\n - Axes: A matplotlib.axes.Axes object that represents the plot\n of the histogram with the 'value' column data. If applicable,\n a normal distribution curve fitted to the data is overlaid. The\n histogram's bars are green with 60% opacity, and the normal\n distribution curve is black with a linewidth of 2. 
The plot is\n titled \"Distribution\", with \"Value\" as the x-axis label and\n \"Frequency\" as the y-axis label.\n\n Requirements:\n - collections.Counter\n - numpy\n - scipy.stats.norm\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({'value': [1, 2, 2, 3, 3, 4, 3, 2, 1, 4, 4, 4, 2, 2, 3, 1, 1, 1, 3, 2]})\n >>> counter, ax = f_420(df)\n >>> ax\n \n >>> counter\n Counter({2: 6, 1: 5, 3: 5, 4: 4})\n \"\"\"", "canonical_solution": " # Filter only duplicate values\n duplicates = df[df[\"value\"].duplicated(keep=False)]\n duplicates_counter = Counter(duplicates[\"value\"])\n\n # Check if data is empty or constant\n if df.empty or df[\"value\"].nunique() == 1:\n mu, std = None, None\n else:\n mu, std = norm.fit(df[\"value\"])\n\n fig, ax = plt.subplots()\n ax.hist(df[\"value\"], bins=bins, density=True, alpha=0.6, color=\"g\")\n if mu is not None and std is not None:\n xmin, xmax = plt.xlim()\n x = np.linspace(xmin, xmax, 100)\n p = norm.pdf(x, mu, std)\n ax.plot(x, p, \"k\", linewidth=2)\n ax.set_xlabel(\"Value\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(\"Distribution\")\n\n return duplicates_counter, ax", "test": "import unittest\nimport pandas as pd\nfrom collections import Counter\nimport matplotlib\nclass TestCases(unittest.TestCase):\n def _check_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Distribution\")\n self.assertEqual(ax.get_xlabel(), \"Value\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_1(self):\n # Basic case - no repeated value\n df = pd.DataFrame({\"value\": [1, 2, 3, 4, 5]})\n counter, ax = f_420(df)\n self._check_plot(ax)\n self.assertEqual(counter, Counter())\n def test_case_2(self):\n # Basic case - all repeated values\n df = pd.DataFrame({\"value\": [1, 1, 1, 1, 1]})\n counter, ax = f_420(df)\n self._check_plot(ax)\n self.assertEqual(counter, Counter({1: 5}))\n def test_case_3(self):\n # Basic case - test empty\n df = pd.DataFrame({\"value\": []})\n counter, ax 
= f_420(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(counter, Counter())\n def test_case_4(self):\n # Basic case with more diverse data distribution\n df = pd.DataFrame({\"value\": [5, 5, 5, 5, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4]})\n counter, ax = f_420(df)\n self._check_plot(ax)\n self.assertEqual(counter, Counter({5: 4, 1: 4, 2: 3, 3: 2}))\n def test_case_5(self):\n # Test bins explicitly\n np.random.seed(0)\n df = pd.DataFrame({\"value\": np.random.rand(100)})\n for bins in [2, 10, 20]:\n _, ax = f_420(df, bins=bins)\n self.assertEqual(\n len(ax.patches), bins, f\"Expected {bins} bins in the histogram.\"\n )\n def test_case_6(self):\n # Test handling non-numeric value\n df = pd.DataFrame({\"value\": [\"a\", \"b\", \"c\", \"a\", \"b\", \"b\"]})\n with self.assertRaises(TypeError):\n f_420(df)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["scipy.stats.norm.fit", "collections.Counter", "matplotlib.pyplot.xlim", "scipy.stats.norm.pdf", "numpy.linspace", "matplotlib.pyplot.subplots"], "libs": ["collections", "numpy", "matplotlib", "scipy"], "doc": {"description": ["Identify and count duplicate values in a DataFrame's 'value' column.", "This function also plots a histogram for all values in the 'value' column", "and overlays a normal distribution curve on the histogram."], "note": [], "params": ["df (pd.DataFrame): DataFrame containing a numeric 'value' column. If empty,", "the function will return empty Counter and an empty plot.", "bins (int, optional): Number of bins for the histogram. Defaults to 4."], "returns": ["tuple: A tuple containing:", "Counter: A Counter object with the count of each duplicate value.", "Axes: A matplotlib.axes.Axes object that represents the plot", "of the histogram with the 'value' column data. If applicable,", "a normal distribution curve fitted to the data is overlaid. The", "histogram's bars are green with 60% opacity, and the normal", "distribution curve is black with a linewidth of 2. 
The plot is", "titled \"Distribution\", with \"Value\" as the x-axis label and", "\"Frequency\" as the y-axis label."], "reqs": ["collections.Counter", "numpy", "scipy.stats.norm", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'value': [1, 2, 2, 3, 3, 4, 3, 2, 1, 4, 4, 4, 2, 2, 3, 1, 1, 1, 3, 2]})", ">>> counter, ax = f_420(df)", ">>> ax", "", ">>> counter", "Counter({2: 6, 1: 5, 3: 5, 4: 4})"]}} +{"task_id": "f_788", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LinearRegression\n\ndef f_788(start_date='2016-01-01', periods=13, freq='WOM-2FRI', sales_data=None):\n \"\"\"\n Generate a sales series and forecast future sales using linear regression.\n \n Functionality:\n - This function generates a time series of sales data starting from a specified date, then uses linear regression to forecast future sales based on the provided or generated sales data.\n \n Input:\n - start_date (str): The start date for the sales data in YYYY-MM-DD format. Default is '2016-01-01'.\n - periods (int): The number of periods for which the sales data is available. Default is 13.\n - freq (str): The frequency of the sales data, e.g., 'WOM-2FRI' for the second Friday of each month. Default is 'WOM-2FRI'.\n - sales_data (array-like, optional): An array containing actual sales data. If not provided, random data will be generated.\n \n Output:\n - A numpy array containing the forecasted future sales for the same number of periods as the input data.\n \n Examples:\n >>> np.random.seed(42) # For consistent random data generation in examples\n >>> f_788('2016-01-01', 13, 'WOM-2FRI')\n array([313.65384615, 318.56043956, 323.46703297, 328.37362637,\n 333.28021978, 338.18681319, 343.09340659, 348. ,\n 352.90659341, 357.81318681, 362.71978022, 367.62637363,\n 372.53296703])\n >>> f_788('2020-01-01', 5, 'M', [200, 300, 400, 500, 600])\n array([238.9, 226. 
, 213.1, 200.2, 187.3])\n \"\"\"", "canonical_solution": " sales_data = np.random.randint(low=100, high=500, size=periods)\n \n date_range = pd.date_range(start=start_date, freq=freq, periods=periods)\n sales_df = pd.DataFrame({'Date': date_range, 'Sales': sales_data})\n \n X = np.arange(len(sales_df)).reshape(-1, 1)\n y = sales_df['Sales'].values\n \n model = LinearRegression()\n model.fit(X, y)\n \n future_dates = np.arange(len(sales_df), 2*len(sales_df)).reshape(-1, 1)\n future_sales = model.predict(future_dates)\n \n return future_sales", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_with_default_parameters(self):\n np.random.seed(42) # For consistent test setup\n forecasted_sales = f_788()\n self.assertIsInstance(forecasted_sales, np.ndarray)\n self.assertEqual(forecasted_sales.shape[0], 13)\n \n def test_with_custom_parameters(self):\n np.random.seed(0) # For consistent test setup\n forecasted_sales = f_788('2020-01-01', 10, 'M', [200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100])\n self.assertIsInstance(forecasted_sales, np.ndarray)\n self.assertEqual(forecasted_sales.shape[0], 10)\n \n def test_with_random_sales_data(self):\n np.random.seed(55) # For consistent test setup\n forecasted_sales = f_788(periods=5)\n self.assertIsInstance(forecasted_sales, np.ndarray)\n self.assertEqual(forecasted_sales.shape[0], 5)\n \n def test_forecasted_values_increasing(self):\n np.random.seed(66) # For consistent test setup\n sales_data = [100, 150, 200, 250, 300]\n forecasted_sales = f_788('2021-01-01', 5, 'M', sales_data)\n self.assertFalse(all(forecasted_sales[i] <= forecasted_sales[i + 1] for i in range(len(forecasted_sales) - 1)))\n \n def test_with_specific_sales_data(self):\n np.random.seed(42) # For consistent test setup\n sales_data = [100, 200, 300, 400, 500]\n forecasted_sales = f_788('2022-01-01', 5, 'Q', sales_data)\n self.assertIsInstance(forecasted_sales, np.ndarray)\n 
self.assertEqual(forecasted_sales.shape[0], 5)", "apis": ["numpy.random.randint", "numpy.arange", "pandas.DataFrame", "numpy.random", "pandas.date_range", "sklearn.linear_model.LinearRegression"], "libs": ["pandas", "numpy", "sklearn"], "doc": {"description": ["Generate a sales series and forecast future sales using linear regression.", "Functionality:", "- This function generates a time series of sales data starting from a specified date, then uses linear regression to forecast future sales based on the provided or generated sales data.", "Input:", "- start_date (str): The start date for the sales data in YYYY-MM-DD format. Default is '2016-01-01'.", "- periods (int): The number of periods for which the sales data is available. Default is 13.", "- freq (str): The frequency of the sales data, e.g., 'WOM-2FRI' for the second Friday of each month. Default is 'WOM-2FRI'.", "- sales_data (array-like, optional): An array containing actual sales data. If not provided, random data will be generated.", "Output:", "- A numpy array containing the forecasted future sales for the same number of periods as the input data."], "note": [], "params": [], "returns": [], "reqs": [], "raises": [], "example": ["Examples:", ">>> np.random.seed(42) # For consistent random data generation in examples", ">>> f_788('2016-01-01', 13, 'WOM-2FRI')", "array([313.65384615, 318.56043956, 323.46703297, 328.37362637,", "333.28021978, 338.18681319, 343.09340659, 348. ,", "352.90659341, 357.81318681, 362.71978022, 367.62637363,", "372.53296703])", ">>> f_788('2020-01-01', 5, 'M', [200, 300, 400, 500, 600])", "array([238.9, 226. 
, 213.1, 200.2, 187.3])"]}} +{"task_id": "f_387", "prompt": "import pandas as pd\nfrom datetime import datetime, timedelta\nimport random\n\n\ndef f_387(epoch_milliseconds, seed=0):\n \"\"\"\n Generate user activity logs from a given epoch time to the current time.\n\n This function iterates from the starting epoch time to the current system\n time, incrementally increasing the time by a random number of seconds (an\n integer in [1, 10]) between each log entry. Each log entry records a user\n performing an activity at a specific time.\n\n Parameters:\n - epoch_milliseconds (int): Starting epoch time in milliseconds. Must be in\n the past compared to current system time.\n - seed (int): random seed for reproducibility. Defaults to 0.\n\n Returns:\n - pd.DataFrame: A DataFrame containing logs of user activities, with columns:\n - 'User': User names, randomly chosen from a predefined list of users,\n ['user1', 'user2', 'user3', 'user4', 'user5'].\n - 'Activity': Activities performed by the users, randomly chosen from a\n predefined list of activities, ['login', 'logout', 'browse',\n 'search', 'purchase'].\n - 'Time': The timestamp of when the activity occurred, incrementally\n increasing from the starting epoch time to the current time.\n\n Requirements:\n - pandas\n - datetime.datetime.fromtimestamp\n - datetime.timedelta\n - random\n\n Example:\n >>> log = f_387(1615168051807)\n >>> type(log)\n \n >>> log.iloc[0]\n User user4\n Activity search\n Time 2021-03-08 12:47:31.807000\n Name: 0, dtype: object\n \"\"\"", "canonical_solution": " random.seed(seed)\n\n USERS = [\"user1\", \"user2\", \"user3\", \"user4\", \"user5\"]\n ACTIVITIES = [\"login\", \"logout\", \"browse\", \"search\", \"purchase\"]\n\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n end_time = datetime.now()\n if start_time >= end_time:\n raise ValueError(\"Start time must be before current system time\")\n\n logs = []\n current_time = start_time\n while current_time <= end_time:\n 
user = random.choice(USERS)\n activity = random.choice(ACTIVITIES)\n logs.append([user, activity, current_time])\n current_time += timedelta(seconds=random.randint(1, 10))\n log_df = pd.DataFrame(logs, columns=[\"User\", \"Activity\", \"Time\"])\n return log_df", "test": "import unittest\nimport pandas as pd\nfrom datetime import datetime, timedelta\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic functionality - 1 day ago\n epoch_milliseconds = int(\n (datetime.now() - timedelta(days=1)).timestamp() * 1000\n )\n log = f_387(epoch_milliseconds)\n self.assertTrue(isinstance(log, pd.DataFrame))\n self.assertTrue(\"User\" in log.columns)\n self.assertTrue(\"Activity\" in log.columns)\n self.assertTrue(\"Time\" in log.columns)\n start_time = datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n self.assertEqual(log.iloc[0][\"Time\"], start_time)\n def test_case_2(self):\n # Test with a short time frame - 1 minutes ago\n epoch_milliseconds = int(\n (datetime.now() - timedelta(minutes=1)).timestamp() * 1000\n )\n log = f_387(epoch_milliseconds)\n self.assertTrue(len(log) > 0) # Should have at least one entry\n self.assertTrue(\n log[\"Time\"].min() >= datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n )\n def test_case_3(self):\n # Test with a specific seed\n epoch_milliseconds = int(\n (datetime.now() - timedelta(days=1)).timestamp() * 1000\n )\n seed = 42\n log = f_387(epoch_milliseconds, seed=seed)\n first_row = log.iloc[0]\n expected_user = \"user1\"\n expected_activity = \"login\"\n self.assertEqual(first_row[\"User\"], expected_user)\n self.assertEqual(first_row[\"Activity\"], expected_activity)\n def test_case_4(self):\n # Test functionality over a longer period - 1 month ago\n epoch_milliseconds = int(\n (datetime.now() - timedelta(days=30)).timestamp() * 1000\n )\n log = f_387(epoch_milliseconds)\n # Ensure that log timestamps are properly incrementing\n time_diffs = log[\"Time\"].diff().dropna()\n self.assertTrue(all(time_diffs > 
timedelta(seconds=0)))\n seconds_in_a_month = (\n 30 * 24 * 60 * 60\n ) # Approximate number of seconds in a month\n max_possible_entries = (\n seconds_in_a_month # Assuming a minimum of 1-second increments\n )\n min_possible_entries = (\n seconds_in_a_month // 10\n ) # Assuming a maximum of 10-second increments\n # Verify that the log has a reasonable number of entries given the time frame\n self.assertTrue(min_possible_entries <= len(log) <= max_possible_entries)\n self.assertTrue(\n log[\"Time\"].min() >= datetime.fromtimestamp(epoch_milliseconds / 1000.0)\n )\n self.assertTrue(log[\"Time\"].max() <= datetime.now())\n def test_case_5(self):\n # Test invalid start time (future)\n epoch_milliseconds = int(\n (datetime.now() + timedelta(days=1)).timestamp() * 1000\n )\n with self.assertRaises(Exception):\n f_387(epoch_milliseconds)", "apis": ["datetime.datetime.fromtimestamp", "pandas.DataFrame", "datetime.datetime.now", "random.randint", "random.seed", "random.choice", "datetime.timedelta"], "libs": ["random", "pandas", "datetime"], "doc": {"description": ["Generate user activity logs from a given epoch time to the current time.", "This function iterates from the starting epoch time to the current system", "time, incrementally increasing the time by a random number of seconds (an", "integer in [1, 10]) between each log entry. Each log entry records a user", "performing an activity at a specific time."], "note": [], "params": ["epoch_milliseconds (int): Starting epoch time in milliseconds. Must be in", "the past compared to current system time.", "seed (int): random seed for reproducibility. 
Defaults to 0."], "returns": ["pd.DataFrame: A DataFrame containing logs of user activities, with columns:", "'User': User names, randomly chosen from a predefined list of users,", "['user1', 'user2', 'user3', 'user4', 'user5'].", "'Activity': Activities performed by the users, randomly chosen from a", "predefined list of activities, ['login', 'logout', 'browse',", "'search', 'purchase'].", "'Time': The timestamp of when the activity occurred, incrementally", "increasing from the starting epoch time to the current time."], "reqs": ["pandas", "datetime.datetime.fromtimestamp", "datetime.timedelta", "random"], "raises": [], "example": [">>> log = f_387(1615168051807)", ">>> type(log)", "", ">>> log.iloc[0]", "User user4", "Activity search", "Time 2021-03-08 12:47:31.807000", "Name: 0, dtype: object"]}} +{"task_id": "f_750", "prompt": "import os\nimport pandas as pd\nimport re\nimport matplotlib.pyplot as plt\nimport numpy as np\n\ndef f_750(directory: str, pattern: str) -> list:\n \"\"\"\n Searches a directory for CSV files matching a given regular expression pattern,\n reads sales data from these files, and plots the sales data with month on the x-axis and sales on the y-axis.\n \n Note:\n - Each CSV file contains two columns: 'Month' and 'Sales'.\n\n Parameters:\n - directory (str): The directory path where the CSV files are located.\n - pattern (str): The regular expression pattern to match the filenames.\n\n Returns:\n - A list of matplotlib.axes._subplots.Axes objects, each representing a plot of sales data from a matched CSV file.\n\n Example usage:\n >>> axes = f_750('/path/to/data/', r'^sales_data_\\d{4}.csv')\n >>> len(axes)\n 2\n >>> axes[0].get_title()\n 'sales_data_2021.csv'\n \"\"\"", "canonical_solution": "\n plots = []\n for file in os.listdir(directory):\n if re.match(pattern, file):\n df = pd.read_csv(os.path.join(directory, file))\n ax = df.plot(x='Month', y='Sales', title=file)\n plots.append(ax)\n plt.show()\n return plots", "test": "import 
unittest\nimport shutil\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n # Prepare test data\n cls.directory = \"f_750_data_wenhao/\"\n cls.pattern = r\"^sales_data_\\d{4}.csv\"\n os.makedirs(cls.directory, exist_ok=True)\n data_2021 = pd.DataFrame({\n 'Month': ['January', 'February', 'March'],\n 'Sales': [100, 150, 200]\n })\n data_2022 = pd.DataFrame({\n 'Month': ['January', 'February', 'March'],\n 'Sales': [120, 130, 210]\n })\n data_2021.to_csv(cls.directory + \"sales_data_2021.csv\", index=False)\n data_2022.to_csv(cls.directory + \"sales_data_2022.csv\", index=False)\n @classmethod\n def tearDownClass(cls):\n # Clean up test data\n shutil.rmtree(cls.directory)\n def test_plots_generated(self):\n plots = f_750(self.directory, self.pattern)\n self.assertEqual(len(plots), 2, \"Should generate two plots for two CSV files\")\n def test_plot_titles(self):\n plots = f_750(self.directory, self.pattern)\n expected_titles = ['sales_data_2022.csv', 'sales_data_2021.csv']\n plot_titles = [plot.get_title() for plot in plots]\n self.assertEqual(plot_titles, expected_titles, \"Plot titles should match the CSV filenames\")\n def test_no_files_matched(self):\n plots = f_750(self.directory, r\"^no_match_\\d{4}.csv\")\n self.assertEqual(len(plots), 0, \"Should return an empty list if no files match the pattern\")\n def test_invalid_directory(self):\n with self.assertRaises(FileNotFoundError):\n f_750(\"/invalid/directory/\", self.pattern)\n def test_plot_data_integrity(self):\n plots = f_750(self.directory, self.pattern)\n # Read the CSV files again to get expected data\n expected_data = []\n for file in os.listdir(self.directory):\n if re.match(self.pattern, file):\n df = pd.read_csv(os.path.join(self.directory, file))\n expected_data.append(df['Sales'].to_list())\n for plot, expected_sales in zip(plots, expected_data):\n lines = plot.get_lines()\n for line in lines:\n y_data = line.get_ydata()\n # Use np.isclose for floating point comparison, if 
necessary\n self.assertTrue(any(np.array_equal(y_data, expected) for expected in expected_data), \"Plotted data should match the CSV file content\")", "apis": ["pandas.read_csv", "re.match", "matplotlib.pyplot.show", "os.listdir", "os.path", "os.path.join"], "libs": ["re", "pandas", "matplotlib", "os"], "doc": {"description": ["Searches a directory for CSV files matching a given regular expression pattern,", "reads sales data from these files, and plots the sales data with month on the x-axis and sales on the y-axis.", "Example usage:", ">>> axes = f_750('/path/to/data/', r'^sales_data_\\d{4}.csv')", ">>> len(axes)", "2", ">>> axes[0].get_title()", "'sales_data_2021.csv'"], "note": ["Each CSV file contains two columns: 'Month' and 'Sales'."], "params": ["directory (str): The directory path where the CSV files are located.", "pattern (str): The regular expression pattern to match the filenames."], "returns": ["A list of matplotlib.axes._subplots.Axes objects, each representing a plot of sales data from a matched CSV file."], "reqs": [], "raises": [], "example": []}} +{"task_id": "f_572", "prompt": "import numpy as np\nimport math\nimport random\nfrom random import uniform\n\n\ndef f_572(radius, num_points):\n \"\"\"\n Create a tuple with a list of random points within a circle of a given radius.\n \n Parameters:\n - radius (int): The radius of the circle.\n - num_points (int): The number of points to be generated.\n\n Returns:\n - out (list): A list of points within a circle.\n\n Requirements:\n - numpy\n - math\n - random\n\n Example:\n >>> random.seed(42)\n >>> f_572(1, 3)\n [(-0.10124546928297637, -0.12149119380571095), (-0.07399370924760951, 0.46662154808860146), (-0.06984148700093858, -0.8196472742078809)]\n \"\"\"", "canonical_solution": " out = []\n \n for _ in range(num_points):\n theta = uniform(0, 2*np.pi)\n r = radius * math.sqrt(uniform(0, 1))\n x = r * math.cos(theta)\n y = r * math.sin(theta)\n out.append((x, y))\n \n return out", "test": "import 
unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n points = f_572(1, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 1)\n def test_case_2(self):\n points = f_572(2, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 4)\n def test_case_3(self):\n points = f_572(3, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 9)\n def test_case_4(self):\n points = f_572(4, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 16)\n def test_case_5(self):\n points = f_572(5, 3)\n for x, y in points:\n self.assertTrue(x**2 + y**2 <= 25)", "apis": ["math.sqrt", "numpy.pi", "math.sin", "math.cos", "random.uniform"], "libs": ["random", "numpy", "math"], "doc": {"description": ["Create a tuple with a list of random points within a circle of a given radius."], "note": [], "params": ["radius (int): The radius of the circle.", "num_points (int): The number of points to be generated."], "returns": ["out (list): A list of points within a circle."], "reqs": ["numpy", "math", "random"], "raises": [], "example": [">>> random.seed(42)", ">>> f_572(1, 3)", "[(-0.10124546928297637, -0.12149119380571095), (-0.07399370924760951, 0.46662154808860146), (-0.06984148700093858, -0.8196472742078809)]"]}} +{"task_id": "f_410", "prompt": "import collections\nimport matplotlib.pyplot as plt\n\n\ndef f_410(data):\n \"\"\"\n Combine a list of dictionaries with possibly differing keys (student names) into a single dictionary,\n calculate the average score for each student, and return a bar chart of average student scores with\n student on the x-axis and average score on the y-axis.\n\n This function handles data with varying dictionary lengths and missing keys by averaging available scores,\n ignoring None. If there is any negative score, the function raises ValueError.\n Bar colors can be: 'red', 'yellow', 'green', 'blue', 'purple'.\n\n Parameters:\n data (list): A list of dictionaries. 
The keys are student names and the values are scores.\n\n Returns:\n ax (matplotlib.axes._axes.Axes or None): A bar chart showing the 'Average Student Scores', with\n 'Student' on the x-axis and 'Average Score' on the y-axis.\n If data is empty, return None.\n\n Requirements:\n - collections\n - matplotlib.pyplot\n\n Example:\n >>> data = [{'John': 5, 'Jane': 10, 'Joe': 7},\\\n {'John': 6, 'Jane': 8, 'Joe': 10},\\\n {'John': 5, 'Jane': 9, 'Joe': 8},\\\n {'John': 7, 'Jane': 10, 'Joe': 9}]\n >>> ax = f_410(data)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0, 0, 'Jane'), Text(1, 0, 'Joe'), Text(2, 0, 'John')]\n \"\"\"", "canonical_solution": " if not data:\n return None\n\n combined_dict = {}\n for d in data:\n for k, v in d.items():\n if v is None:\n continue\n elif v < 0:\n raise ValueError(\"Scores must be non-negative.\")\n if k in combined_dict:\n combined_dict[k].append(v)\n else:\n combined_dict[k] = [v]\n\n avg_scores = {k: sum(v) / len(v) for k, v in combined_dict.items()}\n avg_scores = collections.OrderedDict(sorted(avg_scores.items()))\n labels, values = zip(*avg_scores.items())\n\n fig, ax = plt.subplots()\n ax.bar(labels, values, color=[\"red\", \"yellow\", \"green\", \"blue\", \"purple\"])\n ax.set_title(\"Average Student Scores\")\n ax.set_xlabel(\"Student\")\n ax.set_ylabel(\"Average Score\")\n\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def _check_plot_structure(self, ax):\n # Assert type of returned object\n self.assertIsInstance(ax, plt.Axes)\n # Check plot title, x-label, y-label\n self.assertEqual(ax.get_title(), \"Average Student Scores\")\n self.assertEqual(ax.get_xlabel(), \"Student\")\n self.assertEqual(ax.get_ylabel(), \"Average Score\")\n def test_case_1(self):\n # Test multiple users multiple data points\n data = [\n {\"John\": 5, \"Jane\": 10, \"Joe\": 7},\n {\"John\": 6, \"Jane\": 8, \"Joe\": 10},\n {\"John\": 5, \"Jane\": 9, \"Joe\": 8},\n {\"John\": 7, 
\"Jane\": 10, \"Joe\": 9},\n ]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, label in zip(ax.containers[0], [\"Jane\", \"Joe\", \"John\"]):\n if label == \"Jane\":\n self.assertEqual(bar.get_height(), 9.25)\n elif label == \"Joe\":\n self.assertEqual(bar.get_height(), 8.5)\n elif label == \"John\":\n self.assertEqual(bar.get_height(), 5.75)\n def test_case_2(self):\n # Test same user multiple data points\n data = [{\"John\": 5}, {\"John\": 6}, {\"John\": 7}, {\"John\": 8}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, _ in zip(ax.containers[0], [\"John\"]):\n self.assertEqual(bar.get_height(), 6.5)\n def test_case_3(self):\n # Test with multiple students and one data point each\n data = [{\"John\": 10}, {\"Jane\": 15}, {\"Joe\": 20}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights match the single data point for each student\n expected_scores = {\"Jane\": 15, \"Joe\": 20, \"John\": 10}\n for bar, label in zip(ax.containers[0], expected_scores.keys()):\n self.assertEqual(bar.get_height(), expected_scores[label])\n def test_case_4(self):\n # Test multiple users multiple data points different lengths\n data = [{\"Jane\": 10, \"Joe\": 7}, {\"Joe\": 10}, {\"Jane\": 9, \"John\": 8}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, label in zip(ax.containers[0], [\"Jane\", \"Joe\"]):\n if label == \"Jane\":\n self.assertAlmostEqual(bar.get_height(), 9.5, places=2)\n elif label == \"Joe\":\n self.assertAlmostEqual(bar.get_height(), 8.5, places=2)\n def test_case_5(self):\n # Test handling None\n data = [\n {\"Jane\": 10, \"Joe\": 7},\n {\"Joe\": 10, \"Jane\": None, \"John\": None},\n {\"Jane\": 9, \"John\": 8},\n {\"Joe\": None},\n ]\n ax = f_410(data)\n self._check_plot_structure(ax) # Results should be same as test_case_4\n for bar, label in zip(ax.containers[0], [\"Jane\", 
\"Joe\"]):\n if label == \"Jane\":\n self.assertAlmostEqual(bar.get_height(), 9.5, places=2)\n elif label == \"Joe\":\n self.assertAlmostEqual(bar.get_height(), 8.5, places=2)\n def test_case_6(self):\n # Test only one data point with multiple students\n data = [{\"John\": 5, \"Jane\": 10}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights (average scores)\n for bar, label in zip(ax.containers[0], [\"Jane\", \"John\"]):\n if label == \"Jane\":\n self.assertEqual(bar.get_height(), 10)\n elif label == \"John\":\n self.assertEqual(bar.get_height(), 5)\n def test_case_7(self):\n # Test empty input\n data = []\n ax = f_410(data)\n self.assertIsNone(ax)\n def test_case_8(self):\n # Test with data containing negative scores\n data = [{\"John\": -2, \"Jane\": 3}, {\"John\": -4, \"Jane\": 5}]\n with self.assertRaises(ValueError):\n f_410(data)\n def test_case_9(self):\n # Test with a larger dataset\n data = [{\"John\": i} for i in range(1000)]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar height for the large dataset (average should be close to 499.5)\n self.assertAlmostEqual(\n next(iter(ax.containers[0])).get_height(), 499.5, places=2\n )\n def test_case_10(self):\n # Test with some negative scores mixed with positive ones\n data = [{\"John\": 5, \"Jane\": -1}, {\"John\": -2, \"Jane\": 2}]\n with self.assertRaises(ValueError):\n f_410(data)\n def test_case_11(self):\n # Test with all scores as 0\n data = [{\"John\": 0, \"Jane\": 0}, {\"John\": 0, \"Jane\": 0}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check bar heights are 0 for all students\n for bar, label in zip(ax.containers[0], [\"Jane\", \"John\"]):\n self.assertEqual(bar.get_height(), 0)\n def test_case_12(self):\n # Test with some dictionaries being empty\n data = [{\"John\": 5}, {}, {\"Jane\": 10}]\n ax = f_410(data)\n self._check_plot_structure(ax)\n # Check that the empty dictionary does not affect the output\n expected_scores = {\"Jane\": 10, \"John\": 
5}\n for bar, label in zip(ax.containers[0], expected_scores.keys()):\n self.assertEqual(bar.get_height(), expected_scores[label])\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "collections.OrderedDict"], "libs": ["collections", "matplotlib"], "doc": {"description": ["Combine a list of dictionaries with possibly differing keys (student names) into a single dictionary,", "calculate the average score for each student, and return a bar chart of average student scores with", "student on the x-axis and average score on the y-axis.", "This function handles data with varying dictionary lengths and missing keys by averaging available scores,", "ignoring None. If there is any negative score, the function raises ValueError.", "Bar colors can be: 'red', 'yellow', 'green', 'blue', 'purple'."], "note": [], "params": ["data (list): A list of dictionaries. The keys are student names and the values are scores."], "returns": ["ax (matplotlib.axes._axes.Axes or None): A bar chart showing the 'Average Student Scores', with", "'Student' on the x-axis and 'Average Score' on the y-axis.", "If data is empty, return None."], "reqs": ["collections", "matplotlib.pyplot"], "raises": [], "example": [">>> data = [{'John': 5, 'Jane': 10, 'Joe': 7},\\", "{'John': 6, 'Jane': 8, 'Joe': 10},\\", "{'John': 5, 'Jane': 9, 'Joe': 8},\\", "{'John': 7, 'Jane': 10, 'Joe': 9}]", ">>> ax = f_410(data)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0, 0, 'Jane'), Text(1, 0, 'Joe'), Text(2, 0, 'John')]"]}} {"task_id": "f_880", "prompt": "import numpy as np\nfrom scipy.stats import ttest_ind\n\n\ndef f_880(s1, s2):\n \"\"\"\n Performs a comparative analysis of two pandas Series by visualizing their distributions using a histogram\n and assessing statistical differences through a two-sample t-test.\n\n Note:\n - The labels on the histogram bars correspond to the Series names if available.\n\n Parameters:\n - s1 (pd.Series): The first pandas Series for 
comparison.\n - s2 (pd.Series): The second pandas Series for comparison.\n \n Returns:\n - matplotlib.axes.Axes: An Axes object representing the overlaid histograms of both Series.\n - float: The t-statistic from the t-test, indicating the degree of difference between the two Series' means.\n - float: The two-tailed p-value from the t-test, suggesting the probability of\n observing the data if the null hypothesis (no difference in means) is true.\n\n Requirements:\n - matplotlib\n - scipy\n\n Example:\n >>> s1 = pd.Series(np.random.normal(0, 1, 1000))\n >>> s2 = pd.Series(np.random.normal(1, 1, 1000))\n >>> ax, t_stat, p_value = f_880(s1, s2)\n >>> plt.show() # Display the plot in a non-interactive environment\n \"\"\"", "canonical_solution": "\n # Plotting directly on Series objects\n ax = s1.plot(kind=\"hist\", bins=30, alpha=0.5, label=s1.name or \"Series 1\", legend=True)\n s2.plot(kind=\"hist\", bins=30, alpha=0.5, label=s2.name or \"Series 2\", ax=ax, legend=True)\n \n # Performing the two-sample t-test\n t_stat, p_value = ttest_ind(s1, s2, equal_var=False) # Assuming unequal variances by default\n \n # No change needed for plt.show(), already correctly used\n plt.show() # Shows the plot\n \n return ax, t_stat, p_value", "test": "import pandas as pd\nimport numpy as np\nimport unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_880.\"\"\"\n def test_significantly_different_means(self):\n \"\"\"Scenario: Two distributions with significantly different means.\n Expected: The t-test should detect a significant difference (p-value < 0.05).\n \"\"\"\n np.random.seed(42)\n s1 = pd.Series(np.random.normal(0, 1, 1000))\n np.random.seed(42)\n s2 = pd.Series(np.random.normal(5, 1, 1000))\n _, t_stat, p_value = f_880(s1, s2)\n self.assertLess(p_value, 0.05)\n self.assertAlmostEqual(t_stat, -114.1764492547248)\n def test_same_distribution(self):\n \"\"\"Scenario: Two distributions sampled from the same distribution 
(i.e., same mean and variance).\n Expected: The t-test should not detect a significant difference (p-value >= 0.05).\n \"\"\"\n np.random.seed(123)\n s1 = pd.Series(np.random.normal(0, 1, 1000))\n np.random.seed(123)\n s2 = pd.Series(np.random.normal(0, 1, 1000))\n _, t_stat, p_value = f_880(s1, s2)\n self.assertGreaterEqual(p_value, 0.05)\n self.assertAlmostEqual(t_stat, 0.0)\n def test_same_mean_different_variance(self):\n \"\"\"Scenario: Two distributions with the same mean but different variances.\n Expected: The t-test might or might not detect a significant difference.\n \"\"\"\n np.random.seed(0)\n s1 = pd.Series(np.random.normal(0, 1, 1000))\n np.random.seed(0)\n s2 = pd.Series(np.random.normal(0, 3, 1000))\n _, t_stat, p_value = f_880(s1, s2)\n self.assertTrue(0 <= p_value <= 1)\n self.assertAlmostEqual(t_stat, 0.9165664411422174)\n def test_histogram_labels(self):\n \"\"\"Scenario: Testing if the histogram labels match the series names.\n Expected: The labels on the histogram should match the series names.\n \"\"\"\n np.random.seed(0)\n s1 = pd.Series(np.random.normal(0, 1, 1000), name=\"Dist1\")\n np.random.seed(0)\n s2 = pd.Series(np.random.normal(1, 1, 1000), name=\"Dist2\")\n ax, _, _ = f_880(s1, s2)\n legend_texts = [text.get_text() for text in ax.legend().get_texts()]\n self.assertIn(\"Dist1\", legend_texts)\n self.assertIn(\"Dist2\", legend_texts)\n def test_distributions_with_outliers(self):\n \"\"\"Scenario: One distribution with outliers and another without.\n Expected: The t-test should detect a significant difference if outliers are far from the mean.\n \"\"\"\n np.random.seed(42)\n s1 = pd.Series(\n np.concatenate([np.random.normal(0, 1, 990), np.array([50] * 10)])\n )\n np.random.seed(42)\n s2 = pd.Series(np.random.normal(0, 1, 1000))\n _, t_stat, p_value = f_880(s1, s2)\n self.assertLess(p_value, 0.05)\n self.assertAlmostEqual(t_stat, 3.0719987200209986)\n def tearDown(self):\n plt.clf()", "apis": ["scipy.stats.ttest_ind"], "libs": 
["scipy"], "doc": {"description": ["Performs a comparative analysis of two pandas Series by visualizing their distributions using a histogram", "and assessing statistical differences through a two-sample t-test."], "note": ["The labels on the histogram bars correspond to the Series names if available."], "params": ["s1 (pd.Series): The first pandas Series for comparison.", "s2 (pd.Series): The second pandas Series for comparison."], "returns": ["matplotlib.axes.Axes: An Axes object representing the overlaid histograms of both Series.", "float: The t-statistic from the t-test, indicating the degree of difference between the two Series' means.", "float: The two-tailed p-value from the t-test, suggesting the probability of", "observing the data if the null hypothesis (no difference in means) is true."], "reqs": ["matplotlib", "scipy"], "raises": [], "example": [">>> s1 = pd.Series(np.random.normal(0, 1, 1000))", ">>> s2 = pd.Series(np.random.normal(1, 1, 1000))", ">>> ax, t_stat, p_value = f_880(s1, s2)", ">>> plt.show() # Display the plot in a non-interactive environment"]}} -{"task_id": "f_381", "prompt": "import re\nimport pandas as pd\n\n\ndef f_381(df: pd.DataFrame, column_name: str, pattern: str) -> pd.DataFrame:\n \"\"\"\n Reverse the order of words in a specific column of a pandas DataFrame where the words\n match a user-specified regular expression pattern, using a nested helper function.\n Words are considered to be whitespace-separated strings. 
This function maintains the\n original order of non-matching words.\n\n Parameters:\n - df (pd.DataFrame): The pandas DataFrame.\n - column_name (str): The name of the column to be modified.\n - pattern (str), the regular expression pattern to match words against.\n\n Returns:\n - pd.DataFrame: A new pandas DataFrame with the specified column's words reordered\n if they match the pattern, maintaining the original order of words that do not match,\n and returning a copy of the unaltered DataFrame if the pattern is empty.\n\n Requirements:\n - pandas\n - re\n\n Example:\n >>> df = pd.DataFrame({'A': ['apple orange', 'red yellow green'], 'B': [1, 2]})\n >>> pattern = r'\\b(?:apple|yellow)\\b'\n >>> reversed_df = f_381(df, 'A', pattern)\n >>> reversed_df\n A B\n 0 apple orange 1\n 1 red yellow green 2\n >>> df = pd.DataFrame({'A': ['yellow car red', 'green apple yellow'], 'B': [3, 4]})\n >>> pattern = r'\\b(?:car|apple|yellow)\\b'\n >>> reversed_df = f_381(df, 'A', pattern)\n >>> reversed_df\n A B\n 0 yellow car red 3\n 1 green apple yellow 4\n \"\"\"", "canonical_solution": "\n def reverse_matched_words(text):\n words = text.split()\n matched_words = [word for word in words if re.search(pattern, word)][::-1]\n new_words = [\n matched_words.pop(0) if re.search(pattern, word) else word for word in words\n ]\n return \" \".join(new_words)\n\n new_df = df.copy()\n if not pattern:\n return new_df\n new_df[column_name] = new_df[column_name].apply(reverse_matched_words)\n return new_df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Example df to test for error handling\n self.df = pd.DataFrame(\n {\"A\": [\"blue car red\", \"green apple yellow\"], \"B\": [3, 4]}\n )\n def test_case_1(self):\n # Test case where no words match the pattern\n df = pd.DataFrame({\"Text\": [\"apple orange\", \"blue red\"], \"Number\": [1, 2]})\n pattern = r\"\\b(?:banana|green)\\b\"\n expected = df.copy()\n result = f_381(df, \"Text\", 
pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_2(self):\n # Test case where all words in a column match the pattern\n df = pd.DataFrame({\"Text\": [\"apple banana\", \"banana apple\"], \"Number\": [1, 2]})\n pattern = r\"\\b(?:apple|banana)\\b\"\n expected = pd.DataFrame(\n {\"Text\": [\"banana apple\", \"apple banana\"], \"Number\": [1, 2]}\n )\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_3(self):\n # Test case with a mix of matching and non-matching words\n df = pd.DataFrame(\n {\"Text\": [\"apple orange banana\", \"blue apple green\"], \"Number\": [1, 2]}\n )\n pattern = r\"\\b(?:apple|banana)\\b\"\n expected = pd.DataFrame(\n {\"Text\": [\"banana orange apple\", \"blue apple green\"], \"Number\": [1, 2]}\n )\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_4(self):\n # Test case where the column contains an empty string\n df = pd.DataFrame({\"Text\": [\"\", \"apple banana\"], \"Number\": [1, 2]})\n pattern = r\"\\b(?:apple|banana)\\b\"\n expected = pd.DataFrame({\"Text\": [\"\", \"banana apple\"], \"Number\": [1, 2]})\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_5(self):\n # Test case where the pattern is an empty string (matches nothing)\n df = pd.DataFrame({\"Text\": [\"apple orange\", \"banana apple\"], \"Number\": [1, 2]})\n pattern = \"\"\n expected = df.copy()\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_6(self):\n # Test the function with a column name that does not exist in the DataFrame\n with self.assertRaises(KeyError):\n f_381(self.df, \"NonexistentColumn\", r\"\\b(?:car|apple|yellow)\\b\")\n def test_case_7(self):\n # Test the function with a non-string column name\n with self.assertRaises(KeyError):\n f_381(self.df, 123, r\"\\b(?:car|apple|yellow)\\b\")\n def test_case_8(self):\n # 
Test the function with an invalid regular expression pattern\n with self.assertRaises(re.error):\n f_381(self.df, \"A\", r\"\\b(?:car|apple|yellow\")", "apis": ["re.search", "pandas.DataFrame"], "libs": ["pandas", "re"], "doc": {"description": ["Reverse the order of words in a specific column of a pandas DataFrame where the words", "match a user-specified regular expression pattern, using a nested helper function.", "Words are considered to be whitespace-separated strings. This function maintains the", "original order of non-matching words."], "note": [], "params": ["df (pd.DataFrame): The pandas DataFrame.", "column_name (str): The name of the column to be modified.", "pattern (str), the regular expression pattern to match words against."], "returns": ["pd.DataFrame: A new pandas DataFrame with the specified column's words reordered", "if they match the pattern, maintaining the original order of words that do not match,", "and returning a copy of the unaltered DataFrame if the pattern is empty."], "reqs": ["pandas", "re"], "raises": [], "example": [">>> df = pd.DataFrame({'A': ['apple orange', 'red yellow green'], 'B': [1, 2]})", ">>> pattern = r'\\b(?:apple|yellow)\\b'", ">>> reversed_df = f_381(df, 'A', pattern)", ">>> reversed_df", "A B", "0 apple orange 1", "1 red yellow green 2", ">>> df = pd.DataFrame({'A': ['yellow car red', 'green apple yellow'], 'B': [3, 4]})", ">>> pattern = r'\\b(?:car|apple|yellow)\\b'", ">>> reversed_df = f_381(df, 'A', pattern)", ">>> reversed_df", "A B", "0 yellow car red 3", "1 green apple yellow 4"]}} -{"task_id": "f_783", "prompt": "import re\nfrom nltk import word_tokenize\nfrom collections import Counter\n\ndef f_783(input_str):\n \"\"\"\n Remove all special characters, punctuation marks and spaces from a string called \"input _ str\" using regex and then count the frequency of each word.\n\n Parameters:\n input_str (str): The input string.\n\n Returns:\n dict: A dictionary with the frequency of each word.\n\n Requirements:\n - 
re\n - nltk.word_tokenize\n - collections.Counter\n\n Example:\n >>> f_783('Special $#! characters spaces 888323')\n Counter({'Special': 1, 'characters': 1, 'spaces': 1, '888323': 1})\n \"\"\"", "canonical_solution": " cleaned_str = re.sub('[^A-Za-z0-9 ]+', '', input_str)\n words = word_tokenize(cleaned_str)\n freq_dict = Counter(words)\n\n return freq_dict", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_783('Special $#! characters spaces 888323')\n expected = {'Special': 1, 'characters': 1, 'spaces': 1, '888323': 1}\n self.assertEqual(result, expected)\n def test_case_2(self):\n result = f_783('Hello hello world')\n expected = {'Hello': 1, 'hello': 1, 'world': 1}\n self.assertEqual(result, expected)\n def test_case_3(self):\n result = f_783('')\n expected = {}\n self.assertEqual(result, expected)\n def test_case_4(self):\n result = f_783('123 123 456')\n expected = {'123': 2, '456': 1}\n self.assertEqual(result, expected)\n def test_case_5(self):\n result = f_783('Hello123 #$! 123')\n expected = {'Hello123': 1, '123': 1}\n self.assertEqual(result, expected)", "apis": ["nltk.word_tokenize", "re.sub", "collections.Counter"], "libs": ["collections", "re", "nltk"], "doc": {"description": ["Remove all special characters, punctuation marks and spaces from a string called \"input _ str\" using regex and then count the frequency of each word."], "note": [], "params": ["input_str (str): The input string."], "returns": ["dict: A dictionary with the frequency of each word."], "reqs": ["re", "nltk.word_tokenize", "collections.Counter"], "raises": [], "example": [">>> f_783('Special $#! 
characters spaces 888323')", "Counter({'Special': 1, 'characters': 1, 'spaces': 1, '888323': 1})"]}} -{"task_id": "f_367", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_367(file_path=\"data.csv\", columns=[\"A\", \"B\", \"C\"]):\n \"\"\"\n Read a CSV file into a Pandas DataFrame, convert numeric values into floats,and draw a line chart of data in the specified columns.\n In addition, compute the cube-root of the data.\n \n Parameters:\n - file_path (str): Path to the CSV file. Default is 'data.csv'.\n - columns (list of str): List of column names from the data to plot.\n Default is ['A', 'B', 'C'].\n\n Returns:\n tuple: A tuple containing:\n - DataFrame: A pandas DataFrame of the data in the CSV file.\n - Axes: A matplotlib Axes object showing the plotted data.\n - Series: A pandas Series containing the cube-root of the data.\n \n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> df, ax, croot = f_367('path_to_csv.csv', ['Column1', 'Column2', 'Column3'])\n >>> df\n Column1 Column2 Column3\n 0 1.0 2.0 3.0\n 1 4.0 5.0 6.0\n >>> ax\n \n >>> croot\n 0 1.0 \n \"\"\"", "canonical_solution": " df = pd.read_csv(file_path, dtype=float)\n ax = df[columns].plot()\n croot = np.cbrt(df[columns])\n return df, ax, croot", "test": "import unittest\nimport tempfile\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.test_dir = tempfile.TemporaryDirectory()\n self.temp_files = {}\n # Data setups for different scenarios\n self.data_sets = {\n \"int\": pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6], \"C\": [7, 8, 9]}),\n \"varied\": pd.DataFrame(\n {\n \"IntColumn\": [1, 2, 3],\n \"FloatColumn\": [1.1, 2.2, 3.3],\n \"StringColumn\": [\"4\", \"5\", \"6\"],\n }\n ),\n \"varied_invalid\": pd.DataFrame(\n {\n \"IntColumn\": [1, 2, 3],\n \"FloatColumn\": [1.1, 2.2, 3.3],\n \"StringColumn\": [\"a\", \"b\", \"c\"],\n }\n ),\n }\n # Write data sets to temporary files\n for key, df in 
self.data_sets.items():\n temp_file_path = os.path.join(self.test_dir.name, f\"{key}.csv\")\n df.to_csv(temp_file_path, index=False, header=True)\n self.temp_files[key] = temp_file_path\n def tearDown(self):\n self.test_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n file_path = self.temp_files[\"int\"]\n df, ax, croot = f_367(file_path=file_path, columns=[\"A\", \"B\", \"C\"])\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(df.columns.tolist(), [\"A\", \"B\", \"C\"])\n self.assertTrue((df[\"A\"].tolist() == [1, 2, 3]))\n self.assertTrue((df[\"B\"].tolist() == [4, 5, 6]))\n self.assertTrue((df[\"C\"].tolist() == [7, 8, 9]))\n self.assertEqual(croot.to_dict(), {'A': {0: 1.0, 1: 1.2599210498948734, 2: 1.4422495703074083}, 'B': {0: 1.5874010519681996, 1: 1.7099759466766968, 2: 1.8171205928321394}, 'C': {0: 1.9129311827723894, 1: 2.0, 2: 2.080083823051904}})\n \n def test_case_2(self):\n file_path = self.temp_files[\"int\"]\n with self.assertRaises(KeyError):\n f_367(file_path=file_path, columns=[\"A\", \"B\", \"Nonexistent\"])\n def test_case_3(self):\n file_path = self.temp_files[\"varied\"]\n df, ax, croot = f_367(\n file_path=file_path, columns=[\"IntColumn\", \"FloatColumn\", \"StringColumn\"]\n )\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(df[\"IntColumn\"].equals(pd.Series([1.0, 2.0, 3.0])))\n self.assertTrue(df[\"FloatColumn\"].equals(pd.Series([1.1, 2.2, 3.3])))\n self.assertTrue(df[\"StringColumn\"].equals(pd.Series([4.0, 5.0, 6.0])))\n self.assertEqual(croot.to_dict(), {'IntColumn': {0: 1.0, 1: 1.2599210498948734, 2: 1.4422495703074083}, 'FloatColumn': {0: 1.0322801154563672, 1: 1.300591446851387, 2: 1.4888055529538275}, 'StringColumn': {0: 1.5874010519681996, 1: 1.7099759466766968, 2: 1.8171205928321394}})\n \n def test_case_4(self):\n file_path = self.temp_files[\"varied_invalid\"]\n with self.assertRaises(Exception):\n 
f_367(file_path=file_path, columns=[\"StringColumn\"])\n def test_case_5(self):\n with self.assertRaises(FileNotFoundError):\n f_367(file_path=\"nonexistent_file.csv\")", "apis": ["pandas.read_csv", "numpy.cbrt"], "libs": ["numpy", "pandas"], "doc": {"description": ["Read a CSV file into a Pandas DataFrame, convert numeric values into floats,and draw a line chart of data in the specified columns.", "In addition, compute the cube-root of the data."], "note": [], "params": ["file_path (str): Path to the CSV file. Default is 'data.csv'.", "columns (list of str): List of column names from the data to plot.", "Default is ['A', 'B', 'C']."], "returns": ["tuple: A tuple containing:", "DataFrame: A pandas DataFrame of the data in the CSV file.", "Axes: A matplotlib Axes object showing the plotted data.", "Series: A pandas Series containing the cube-root of the data."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> df, ax, croot = f_367('path_to_csv.csv', ['Column1', 'Column2', 'Column3'])", ">>> df", "Column1 Column2 Column3", "0 1.0 2.0 3.0", "1 4.0 5.0 6.0", ">>> ax", "", ">>> croot", "0 1.0"]}} -{"task_id": "f_865", "prompt": "import pandas as pd\nimport os\nfrom datetime import datetime\nfrom pandas.errors import EmptyDataError\n\n\ndef f_865(csv_file_path, column_name, date_format=\"%Y-%m-%d\"):\n \"\"\"\n Reads a CSV file and processes its date-related data. The function performs several key tasks\n such as checking for the file's existence, validating the presence of a specified date column,\n converting date values to datetime objects, filtering rows based on the current date, and sorting\n the resulting data.\n\n The function handles special cases, like an empty CSV file, by returning an empty DataFrame and\n raises exceptions for specific error scenarios like missing files or columns.\n\n Parameters:\n - csv_file_path (str): The path to the CSV file. 
FileNotFoundError is raised if the path is invalid.\n - column_name (str): The name of the column containing date values. ValueError is raised if\n this column is missing in the CSV file.\n - date_format (str, optional): The format of the date values in the specified column. Defaults to '%Y-%m-%d'.\n\n Returns:\n - pandas\n - os\n - datetime.datetime\n - pandas.errors.EmptyDataError\n \n Raises:\n - FileNotFoundError: If the specified CSV file is not found at the given path.\n - ValueError: If the specified column is not present in the CSV file.\n\n Requirements:\n - pandas\n - os\n - datetime\n\n Example:\n >>> f_865('path/to/csvfile.csv', 'DateColumn')\n Date Value\n 0 2023-12-10 100\n 1 2023-12-11 150\n \"\"\"", "canonical_solution": " if not os.path.isfile(csv_file_path):\n raise FileNotFoundError(f\"The file {csv_file_path} does not exist.\")\n\n try:\n df = pd.read_csv(csv_file_path)\n except EmptyDataError:\n return pd.DataFrame()\n\n if column_name not in df.columns:\n raise ValueError(f\"The column {column_name} is not found in the file.\")\n\n df[column_name] = pd.to_datetime(df[column_name], format=date_format)\n current_date = datetime.now().date()\n df = df[df[column_name].dt.date >= current_date]\n df = df.sort_values(by=column_name)\n\n return df", "test": "import unittest\nimport pandas as pd\nfrom io import StringIO\nfrom datetime import datetime, timedelta\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_865 function.\"\"\"\n def setUp(self):\n # Set future dates for the test data\n future_date_1 = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n future_date_2 = (datetime.now() + timedelta(days=2)).strftime(\"%Y-%m-%d\")\n future_date_3 = (datetime.now() + timedelta(days=3)).strftime(\"%Y-%m-%d\")\n # Create mock data with the correct column names and future dates\n self.valid_csv_data = f\"\"\"Date,Value\\n{future_date_1},100\\n{future_date_2},150\\n{future_date_3},50\"\"\"\n self.valid_csv_path = 
\"valid.csv\"\n with open(self.valid_csv_path, \"w\", encoding=\"utf-8\") as f:\n f.write(self.valid_csv_data)\n # Set today's date as a string for comparison in tests\n self.today_str = datetime.now().strftime(\"%Y-%m-%d\")\n def tearDown(self):\n # Remove created file\n if os.path.exists(self.valid_csv_path):\n os.remove(self.valid_csv_path)\n def test_valid_input(self):\n \"\"\"Test case for valid input CSV file and column name.\"\"\"\n df = f_865(self.valid_csv_path, \"Date\")\n self.assertFalse(df.empty)\n self.assertTrue(all(df[\"Date\"] >= pd.to_datetime(self.today_str)))\n def test_file_not_found(self):\n \"\"\"Test case for non-existing CSV file.\"\"\"\n with self.assertRaises(FileNotFoundError):\n f_865(\"non_existing.csv\", \"Date\")\n def test_column_not_found(self):\n \"\"\"Test case for CSV file without the specified column.\"\"\"\n invalid_csv_data = StringIO(\n \"\"\"\n NotDate,Value\n 2023-12-10,100\n 2023-12-11,150\n \"\"\"\n )\n invalid_csv_path = \"invalid.csv\"\n pd.read_csv(invalid_csv_data).to_csv(invalid_csv_path, index=False)\n with self.assertRaises(ValueError):\n f_865(invalid_csv_path, \"Date\")\n os.remove(invalid_csv_path)\n def test_empty_file(self):\n \"\"\"Test case for an empty CSV file.\"\"\"\n empty_csv_path = \"empty.csv\"\n with open(empty_csv_path, \"w\", encoding=\"utf-8\") as f:\n pass # Create an empty file\n df = f_865(empty_csv_path, \"Date\")\n self.assertTrue(df.empty)\n os.remove(empty_csv_path)\n def test_no_future_dates(self):\n \"\"\"Test case where all dates in the CSV file are in the past.\"\"\"\n past_csv_data = \"\"\"Date,Value\\n2020-01-01,100\\n2020-01-02,150\"\"\"\n past_csv_path = \"past.csv\"\n with open(past_csv_path, \"w\", encoding=\"utf-8\") as f:\n f.write(past_csv_data)\n df = f_865(past_csv_path, \"Date\")\n self.assertTrue(df.empty)\n os.remove(past_csv_path)", "apis": ["pandas.read_csv", "os.path", "datetime.datetime.now", "pandas.to_datetime", "pandas.DataFrame", "os.path.isfile"], "libs": 
["pandas", "datetime", "os"], "doc": {"description": ["Reads a CSV file and processes its date-related data. The function performs several key tasks", "such as checking for the file's existence, validating the presence of a specified date column,", "converting date values to datetime objects, filtering rows based on the current date, and sorting", "the resulting data.", "The function handles special cases, like an empty CSV file, by returning an empty DataFrame and", "raises exceptions for specific error scenarios like missing files or columns."], "note": [], "params": ["csv_file_path (str): The path to the CSV file. FileNotFoundError is raised if the path is invalid.", "column_name (str): The name of the column containing date values. ValueError is raised if", "this column is missing in the CSV file.", "date_format (str, optional): The format of the date values in the specified column. Defaults to '%Y-%m-%d'."], "returns": ["pandas", "os", "datetime.datetime", "pandas.errors.EmptyDataError"], "reqs": ["pandas", "os", "datetime"], "raises": ["FileNotFoundError: If the specified CSV file is not found at the given path.", "ValueError: If the specified column is not present in the CSV file."], "example": [">>> f_865('path/to/csvfile.csv', 'DateColumn')", "Date Value", "0 2023-12-10 100", "1 2023-12-11 150"]}} +{"task_id": "f_381", "prompt": "import re\nimport pandas as pd\n\n\ndef f_381(df: pd.DataFrame, column_name: str, pattern: str) -> pd.DataFrame:\n \"\"\"\n Reverse the order of words in a specific column of a pandas DataFrame where the words\n match a user-specified regular expression pattern, using a nested helper function.\n Words are considered to be whitespace-separated strings. 
This function maintains the\n original order of non-matching words.\n\n Parameters:\n - df (pd.DataFrame): The pandas DataFrame.\n - column_name (str): The name of the column to be modified.\n - pattern (str), the regular expression pattern to match words against.\n\n Returns:\n - pd.DataFrame: A new pandas DataFrame with the specified column's words reordered\n if they match the pattern, maintaining the original order of words that do not match,\n and returning a copy of the unaltered DataFrame if the pattern is empty.\n\n Requirements:\n - pandas\n - re\n\n Example:\n >>> df = pd.DataFrame({'A': ['apple orange', 'red yellow green'], 'B': [1, 2]})\n >>> pattern = r'\\b(?:apple|yellow)\\b'\n >>> reversed_df = f_381(df, 'A', pattern)\n >>> reversed_df\n A B\n 0 apple orange 1\n 1 red yellow green 2\n >>> df = pd.DataFrame({'A': ['yellow car red', 'green apple yellow'], 'B': [3, 4]})\n >>> pattern = r'\\b(?:car|apple|yellow)\\b'\n >>> reversed_df = f_381(df, 'A', pattern)\n >>> reversed_df\n A B\n 0 yellow car red 3\n 1 green apple yellow 4\n \"\"\"", "canonical_solution": "\n def reverse_matched_words(text):\n words = text.split()\n matched_words = [word for word in words if re.search(pattern, word)][::-1]\n new_words = [\n matched_words.pop(0) if re.search(pattern, word) else word for word in words\n ]\n return \" \".join(new_words)\n\n new_df = df.copy()\n if not pattern:\n return new_df\n new_df[column_name] = new_df[column_name].apply(reverse_matched_words)\n return new_df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Example df to test for error handling\n self.df = pd.DataFrame(\n {\"A\": [\"blue car red\", \"green apple yellow\"], \"B\": [3, 4]}\n )\n def test_case_1(self):\n # Test case where no words match the pattern\n df = pd.DataFrame({\"Text\": [\"apple orange\", \"blue red\"], \"Number\": [1, 2]})\n pattern = r\"\\b(?:banana|green)\\b\"\n expected = df.copy()\n result = f_381(df, \"Text\", 
pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_2(self):\n # Test case where all words in a column match the pattern\n df = pd.DataFrame({\"Text\": [\"apple banana\", \"banana apple\"], \"Number\": [1, 2]})\n pattern = r\"\\b(?:apple|banana)\\b\"\n expected = pd.DataFrame(\n {\"Text\": [\"banana apple\", \"apple banana\"], \"Number\": [1, 2]}\n )\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_3(self):\n # Test case with a mix of matching and non-matching words\n df = pd.DataFrame(\n {\"Text\": [\"apple orange banana\", \"blue apple green\"], \"Number\": [1, 2]}\n )\n pattern = r\"\\b(?:apple|banana)\\b\"\n expected = pd.DataFrame(\n {\"Text\": [\"banana orange apple\", \"blue apple green\"], \"Number\": [1, 2]}\n )\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_4(self):\n # Test case where the column contains an empty string\n df = pd.DataFrame({\"Text\": [\"\", \"apple banana\"], \"Number\": [1, 2]})\n pattern = r\"\\b(?:apple|banana)\\b\"\n expected = pd.DataFrame({\"Text\": [\"\", \"banana apple\"], \"Number\": [1, 2]})\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_5(self):\n # Test case where the pattern is an empty string (matches nothing)\n df = pd.DataFrame({\"Text\": [\"apple orange\", \"banana apple\"], \"Number\": [1, 2]})\n pattern = \"\"\n expected = df.copy()\n result = f_381(df, \"Text\", pattern)\n pd.testing.assert_frame_equal(expected, result)\n def test_case_6(self):\n # Test the function with a column name that does not exist in the DataFrame\n with self.assertRaises(KeyError):\n f_381(self.df, \"NonexistentColumn\", r\"\\b(?:car|apple|yellow)\\b\")\n def test_case_7(self):\n # Test the function with a non-string column name\n with self.assertRaises(KeyError):\n f_381(self.df, 123, r\"\\b(?:car|apple|yellow)\\b\")\n def test_case_8(self):\n # 
Test the function with an invalid regular expression pattern\n with self.assertRaises(re.error):\n f_381(self.df, \"A\", r\"\\b(?:car|apple|yellow\")", "apis": ["pandas.DataFrame", "re.search"], "libs": ["re", "pandas"], "doc": {"description": ["Reverse the order of words in a specific column of a pandas DataFrame where the words", "match a user-specified regular expression pattern, using a nested helper function.", "Words are considered to be whitespace-separated strings. This function maintains the", "original order of non-matching words."], "note": [], "params": ["df (pd.DataFrame): The pandas DataFrame.", "column_name (str): The name of the column to be modified.", "pattern (str), the regular expression pattern to match words against."], "returns": ["pd.DataFrame: A new pandas DataFrame with the specified column's words reordered", "if they match the pattern, maintaining the original order of words that do not match,", "and returning a copy of the unaltered DataFrame if the pattern is empty."], "reqs": ["pandas", "re"], "raises": [], "example": [">>> df = pd.DataFrame({'A': ['apple orange', 'red yellow green'], 'B': [1, 2]})", ">>> pattern = r'\\b(?:apple|yellow)\\b'", ">>> reversed_df = f_381(df, 'A', pattern)", ">>> reversed_df", "A B", "0 apple orange 1", "1 red yellow green 2", ">>> df = pd.DataFrame({'A': ['yellow car red', 'green apple yellow'], 'B': [3, 4]})", ">>> pattern = r'\\b(?:car|apple|yellow)\\b'", ">>> reversed_df = f_381(df, 'A', pattern)", ">>> reversed_df", "A B", "0 yellow car red 3", "1 green apple yellow 4"]}} +{"task_id": "f_783", "prompt": "import re\nfrom nltk import word_tokenize\nfrom collections import Counter\n\ndef f_783(input_str):\n \"\"\"\n Remove all special characters, punctuation marks and spaces from a string called \"input _ str\" using regex and then count the frequency of each word.\n\n Parameters:\n input_str (str): The input string.\n\n Returns:\n dict: A dictionary with the frequency of each word.\n\n Requirements:\n - 
re\n - nltk.word_tokenize\n - collections.Counter\n\n Example:\n >>> f_783('Special $#! characters spaces 888323')\n Counter({'Special': 1, 'characters': 1, 'spaces': 1, '888323': 1})\n \"\"\"", "canonical_solution": " cleaned_str = re.sub('[^A-Za-z0-9 ]+', '', input_str)\n words = word_tokenize(cleaned_str)\n freq_dict = Counter(words)\n\n return freq_dict", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_783('Special $#! characters spaces 888323')\n expected = {'Special': 1, 'characters': 1, 'spaces': 1, '888323': 1}\n self.assertEqual(result, expected)\n def test_case_2(self):\n result = f_783('Hello hello world')\n expected = {'Hello': 1, 'hello': 1, 'world': 1}\n self.assertEqual(result, expected)\n def test_case_3(self):\n result = f_783('')\n expected = {}\n self.assertEqual(result, expected)\n def test_case_4(self):\n result = f_783('123 123 456')\n expected = {'123': 2, '456': 1}\n self.assertEqual(result, expected)\n def test_case_5(self):\n result = f_783('Hello123 #$! 123')\n expected = {'Hello123': 1, '123': 1}\n self.assertEqual(result, expected)", "apis": ["re.sub", "collections.Counter", "nltk.word_tokenize"], "libs": ["re", "collections", "nltk"], "doc": {"description": ["Remove all special characters, punctuation marks and spaces from a string called \"input _ str\" using regex and then count the frequency of each word."], "note": [], "params": ["input_str (str): The input string."], "returns": ["dict: A dictionary with the frequency of each word."], "reqs": ["re", "nltk.word_tokenize", "collections.Counter"], "raises": [], "example": [">>> f_783('Special $#! 
characters spaces 888323')", "Counter({'Special': 1, 'characters': 1, 'spaces': 1, '888323': 1})"]}} +{"task_id": "f_367", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_367(file_path=\"data.csv\", columns=[\"A\", \"B\", \"C\"]):\n \"\"\"\n Read a CSV file into a Pandas DataFrame, convert numeric values into floats,and draw a line chart of data in the specified columns.\n In addition, compute the cube-root of the data.\n \n Parameters:\n - file_path (str): Path to the CSV file. Default is 'data.csv'.\n - columns (list of str): List of column names from the data to plot.\n Default is ['A', 'B', 'C'].\n\n Returns:\n tuple: A tuple containing:\n - DataFrame: A pandas DataFrame of the data in the CSV file.\n - Axes: A matplotlib Axes object showing the plotted data.\n - Series: A pandas Series containing the cube-root of the data.\n \n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> df, ax, croot = f_367('path_to_csv.csv', ['Column1', 'Column2', 'Column3'])\n >>> df\n Column1 Column2 Column3\n 0 1.0 2.0 3.0\n 1 4.0 5.0 6.0\n >>> ax\n \n >>> croot\n 0 1.0 \n \"\"\"", "canonical_solution": " df = pd.read_csv(file_path, dtype=float)\n ax = df[columns].plot()\n croot = np.cbrt(df[columns])\n return df, ax, croot", "test": "import unittest\nimport tempfile\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.test_dir = tempfile.TemporaryDirectory()\n self.temp_files = {}\n # Data setups for different scenarios\n self.data_sets = {\n \"int\": pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6], \"C\": [7, 8, 9]}),\n \"varied\": pd.DataFrame(\n {\n \"IntColumn\": [1, 2, 3],\n \"FloatColumn\": [1.1, 2.2, 3.3],\n \"StringColumn\": [\"4\", \"5\", \"6\"],\n }\n ),\n \"varied_invalid\": pd.DataFrame(\n {\n \"IntColumn\": [1, 2, 3],\n \"FloatColumn\": [1.1, 2.2, 3.3],\n \"StringColumn\": [\"a\", \"b\", \"c\"],\n }\n ),\n }\n # Write data sets to temporary files\n for key, df in 
self.data_sets.items():\n temp_file_path = os.path.join(self.test_dir.name, f\"{key}.csv\")\n df.to_csv(temp_file_path, index=False, header=True)\n self.temp_files[key] = temp_file_path\n def tearDown(self):\n self.test_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n file_path = self.temp_files[\"int\"]\n df, ax, croot = f_367(file_path=file_path, columns=[\"A\", \"B\", \"C\"])\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(df.columns.tolist(), [\"A\", \"B\", \"C\"])\n self.assertTrue((df[\"A\"].tolist() == [1, 2, 3]))\n self.assertTrue((df[\"B\"].tolist() == [4, 5, 6]))\n self.assertTrue((df[\"C\"].tolist() == [7, 8, 9]))\n self.assertEqual(croot.to_dict(), {'A': {0: 1.0, 1: 1.2599210498948734, 2: 1.4422495703074083}, 'B': {0: 1.5874010519681996, 1: 1.7099759466766968, 2: 1.8171205928321394}, 'C': {0: 1.9129311827723894, 1: 2.0, 2: 2.080083823051904}})\n \n def test_case_2(self):\n file_path = self.temp_files[\"int\"]\n with self.assertRaises(KeyError):\n f_367(file_path=file_path, columns=[\"A\", \"B\", \"Nonexistent\"])\n def test_case_3(self):\n file_path = self.temp_files[\"varied\"]\n df, ax, croot = f_367(\n file_path=file_path, columns=[\"IntColumn\", \"FloatColumn\", \"StringColumn\"]\n )\n self.assertIsInstance(df, pd.DataFrame)\n self.assertIsInstance(ax, plt.Axes)\n self.assertTrue(df[\"IntColumn\"].equals(pd.Series([1.0, 2.0, 3.0])))\n self.assertTrue(df[\"FloatColumn\"].equals(pd.Series([1.1, 2.2, 3.3])))\n self.assertTrue(df[\"StringColumn\"].equals(pd.Series([4.0, 5.0, 6.0])))\n self.assertEqual(croot.to_dict(), {'IntColumn': {0: 1.0, 1: 1.2599210498948734, 2: 1.4422495703074083}, 'FloatColumn': {0: 1.0322801154563672, 1: 1.300591446851387, 2: 1.4888055529538275}, 'StringColumn': {0: 1.5874010519681996, 1: 1.7099759466766968, 2: 1.8171205928321394}})\n \n def test_case_4(self):\n file_path = self.temp_files[\"varied_invalid\"]\n with self.assertRaises(Exception):\n 
f_367(file_path=file_path, columns=[\"StringColumn\"])\n def test_case_5(self):\n with self.assertRaises(FileNotFoundError):\n f_367(file_path=\"nonexistent_file.csv\")", "apis": ["pandas.read_csv", "numpy.cbrt"], "libs": ["numpy", "pandas"], "doc": {"description": ["Read a CSV file into a Pandas DataFrame, convert numeric values into floats,and draw a line chart of data in the specified columns.", "In addition, compute the cube-root of the data."], "note": [], "params": ["file_path (str): Path to the CSV file. Default is 'data.csv'.", "columns (list of str): List of column names from the data to plot.", "Default is ['A', 'B', 'C']."], "returns": ["tuple: A tuple containing:", "DataFrame: A pandas DataFrame of the data in the CSV file.", "Axes: A matplotlib Axes object showing the plotted data.", "Series: A pandas Series containing the cube-root of the data."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> df, ax, croot = f_367('path_to_csv.csv', ['Column1', 'Column2', 'Column3'])", ">>> df", "Column1 Column2 Column3", "0 1.0 2.0 3.0", "1 4.0 5.0 6.0", ">>> ax", "", ">>> croot", "0 1.0"]}} +{"task_id": "f_865", "prompt": "import pandas as pd\nimport os\nfrom datetime import datetime\nfrom pandas.errors import EmptyDataError\n\n\ndef f_865(csv_file_path, column_name, date_format=\"%Y-%m-%d\"):\n \"\"\"\n Reads a CSV file and processes its date-related data. The function performs several key tasks\n such as checking for the file's existence, validating the presence of a specified date column,\n converting date values to datetime objects, filtering rows based on the current date, and sorting\n the resulting data.\n\n The function handles special cases, like an empty CSV file, by returning an empty DataFrame and\n raises exceptions for specific error scenarios like missing files or columns.\n\n Parameters:\n - csv_file_path (str): The path to the CSV file. 
FileNotFoundError is raised if the path is invalid.\n - column_name (str): The name of the column containing date values. ValueError is raised if\n this column is missing in the CSV file.\n - date_format (str, optional): The format of the date values in the specified column. Defaults to '%Y-%m-%d'.\n\n Returns:\n - pandas\n - os\n - datetime.datetime\n - pandas.errors.EmptyDataError\n \n Raises:\n - FileNotFoundError: If the specified CSV file is not found at the given path.\n - ValueError: If the specified column is not present in the CSV file.\n\n Requirements:\n - pandas\n - os\n - datetime\n\n Example:\n >>> f_865('path/to/csvfile.csv', 'DateColumn')\n Date Value\n 0 2023-12-10 100\n 1 2023-12-11 150\n \"\"\"", "canonical_solution": " if not os.path.isfile(csv_file_path):\n raise FileNotFoundError(f\"The file {csv_file_path} does not exist.\")\n\n try:\n df = pd.read_csv(csv_file_path)\n except EmptyDataError:\n return pd.DataFrame()\n\n if column_name not in df.columns:\n raise ValueError(f\"The column {column_name} is not found in the file.\")\n\n df[column_name] = pd.to_datetime(df[column_name], format=date_format)\n current_date = datetime.now().date()\n df = df[df[column_name].dt.date >= current_date]\n df = df.sort_values(by=column_name)\n\n return df", "test": "import unittest\nimport pandas as pd\nfrom io import StringIO\nfrom datetime import datetime, timedelta\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_865 function.\"\"\"\n def setUp(self):\n # Set future dates for the test data\n future_date_1 = (datetime.now() + timedelta(days=1)).strftime(\"%Y-%m-%d\")\n future_date_2 = (datetime.now() + timedelta(days=2)).strftime(\"%Y-%m-%d\")\n future_date_3 = (datetime.now() + timedelta(days=3)).strftime(\"%Y-%m-%d\")\n # Create mock data with the correct column names and future dates\n self.valid_csv_data = f\"\"\"Date,Value\\n{future_date_1},100\\n{future_date_2},150\\n{future_date_3},50\"\"\"\n self.valid_csv_path = 
\"valid.csv\"\n with open(self.valid_csv_path, \"w\", encoding=\"utf-8\") as f:\n f.write(self.valid_csv_data)\n # Set today's date as a string for comparison in tests\n self.today_str = datetime.now().strftime(\"%Y-%m-%d\")\n def tearDown(self):\n # Remove created file\n if os.path.exists(self.valid_csv_path):\n os.remove(self.valid_csv_path)\n def test_valid_input(self):\n \"\"\"Test case for valid input CSV file and column name.\"\"\"\n df = f_865(self.valid_csv_path, \"Date\")\n self.assertFalse(df.empty)\n self.assertTrue(all(df[\"Date\"] >= pd.to_datetime(self.today_str)))\n def test_file_not_found(self):\n \"\"\"Test case for non-existing CSV file.\"\"\"\n with self.assertRaises(FileNotFoundError):\n f_865(\"non_existing.csv\", \"Date\")\n def test_column_not_found(self):\n \"\"\"Test case for CSV file without the specified column.\"\"\"\n invalid_csv_data = StringIO(\n \"\"\"\n NotDate,Value\n 2023-12-10,100\n 2023-12-11,150\n \"\"\"\n )\n invalid_csv_path = \"invalid.csv\"\n pd.read_csv(invalid_csv_data).to_csv(invalid_csv_path, index=False)\n with self.assertRaises(ValueError):\n f_865(invalid_csv_path, \"Date\")\n os.remove(invalid_csv_path)\n def test_empty_file(self):\n \"\"\"Test case for an empty CSV file.\"\"\"\n empty_csv_path = \"empty.csv\"\n with open(empty_csv_path, \"w\", encoding=\"utf-8\") as f:\n pass # Create an empty file\n df = f_865(empty_csv_path, \"Date\")\n self.assertTrue(df.empty)\n os.remove(empty_csv_path)\n def test_no_future_dates(self):\n \"\"\"Test case where all dates in the CSV file are in the past.\"\"\"\n past_csv_data = \"\"\"Date,Value\\n2020-01-01,100\\n2020-01-02,150\"\"\"\n past_csv_path = \"past.csv\"\n with open(past_csv_path, \"w\", encoding=\"utf-8\") as f:\n f.write(past_csv_data)\n df = f_865(past_csv_path, \"Date\")\n self.assertTrue(df.empty)\n os.remove(past_csv_path)", "apis": ["pandas.read_csv", "pandas.to_datetime", "pandas.DataFrame", "datetime.datetime.now", "os.path", "os.path.isfile"], "libs": ["os", 
"pandas", "datetime"], "doc": {"description": ["Reads a CSV file and processes its date-related data. The function performs several key tasks", "such as checking for the file's existence, validating the presence of a specified date column,", "converting date values to datetime objects, filtering rows based on the current date, and sorting", "the resulting data.", "The function handles special cases, like an empty CSV file, by returning an empty DataFrame and", "raises exceptions for specific error scenarios like missing files or columns."], "note": [], "params": ["csv_file_path (str): The path to the CSV file. FileNotFoundError is raised if the path is invalid.", "column_name (str): The name of the column containing date values. ValueError is raised if", "this column is missing in the CSV file.", "date_format (str, optional): The format of the date values in the specified column. Defaults to '%Y-%m-%d'."], "returns": ["pandas", "os", "datetime.datetime", "pandas.errors.EmptyDataError"], "reqs": ["pandas", "os", "datetime"], "raises": ["FileNotFoundError: If the specified CSV file is not found at the given path.", "ValueError: If the specified column is not present in the CSV file."], "example": [">>> f_865('path/to/csvfile.csv', 'DateColumn')", "Date Value", "0 2023-12-10 100", "1 2023-12-11 150"]}} {"task_id": "f_735", "prompt": "import re\nfrom datetime import time\n\ndef f_735(logs: list):\n \"\"\"\n Analyze the given list of logs for the occurrence of errors and calculate the average time of occurrence of errors.\n \n Args:\n - logs (list): A list of log strings.\n \n Returns:\n - list: A list of times when errors occurred.\n - time: The average time of occurrence of these errors.\n \n Requirements:\n - re\n - datetime\n \n Example:\n >>> f_735(['2021-06-15 09:45:00 ERROR: Failed to connect to database',\\\n '2021-06-15 10:15:00 WARNING: Low disk space',\\\n '2021-06-15 10:35:00 INFO: Backup completed successfully'])\n ([datetime.time(9, 45)], datetime.time(9, 
45))\n \"\"\"", "canonical_solution": " \n error_times = []\n total_time = 0\n\n for log in logs:\n if \"ERROR\" in log:\n time_match = re.search(r'(\\d{2}):(\\d{2}):\\d{2}', log)\n if time_match:\n hour, minute = map(int, time_match.groups())\n error_times.append(time(hour, minute))\n total_time += hour * 60 + minute\n\n if error_times:\n avg_hour = (total_time // len(error_times)) // 60\n avg_minute = (total_time // len(error_times)) % 60\n avg_time = time(avg_hour, avg_minute)\n else:\n avg_time = time(0, 0)\n\n return error_times, avg_time", "test": "import unittest\nfrom datetime import time\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n logs = ['2021-06-15 09:45:00 ERROR: Failed to connect to database',\n '2021-06-15 10:15:00 WARNING: Low disk space',\n '2021-06-15 10:35:00 INFO: Backup completed successfully']\n result = f_735(logs)\n self.assertEqual(result, ([time(9, 45)], time(9, 45)))\n def test_case_2(self):\n logs = ['2021-06-15 08:45:00 ERROR: Failed to authenticate',\n '2021-06-15 09:15:00 ERROR: Failed to connect to database',\n '2021-06-15 10:35:00 INFO: Backup completed successfully']\n result = f_735(logs)\n self.assertEqual(result, ([time(8, 45), time(9, 15)], time(9, 0)))\n def test_case_3(self):\n logs = ['2021-06-15 07:45:00 INFO: Backup started',\n '2021-06-15 08:15:00 WARNING: Low memory',\n '2021-06-15 09:35:00 INFO: Backup completed successfully']\n result = f_735(logs)\n self.assertEqual(result, ([], time(0, 0)))\n def test_case_4(self):\n logs = []\n result = f_735(logs)\n self.assertEqual(result, ([], time(0, 0)))\n def test_case_5(self):\n logs = ['2021-06-15 09:45:00 ERROR: Failed to connect to database',\n '2021-06-15 10:15:00 WARNING: Low disk space',\n '2021-06-15 11:45:00 ERROR: Failed to authenticate']\n result = f_735(logs)\n self.assertEqual(result, ([time(9, 45), time(11, 45)], time(10, 45)))\n def test_case_invalid_format(self):\n logs = ['Invalid log format',\n 'Another invalid log format',\n 'Yet another 
invalid log format']\n result = f_735(logs)\n self.assertEqual(result, ([], time(0, 0)))", "apis": ["re.search", "datetime.time"], "libs": ["re", "datetime"], "doc": {"description": ["Analyze the given list of logs for the occurrence of errors and calculate the average time of occurrence of errors.", "Args:", "- logs (list): A list of log strings."], "note": [], "params": [], "returns": ["list: A list of times when errors occurred.", "time: The average time of occurrence of these errors."], "reqs": ["re", "datetime"], "raises": [], "example": [">>> f_735(['2021-06-15 09:45:00 ERROR: Failed to connect to database',\\", "'2021-06-15 10:15:00 WARNING: Low disk space',\\", "'2021-06-15 10:35:00 INFO: Backup completed successfully'])", "([datetime.time(9, 45)], datetime.time(9, 45))"]}} -{"task_id": "f_844", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_844(csv_file_path: str):\n \"\"\"\n This function reads data from a CSV file, normalizes a specific column named 'column1', and then plots the normalized data.\n\n - The title is created using Python's string formatting, aligning 'Plot Title' and 'Normalized Column 1' on either side of a \n colon, each padded to 20 characters.\n - Similarly, the x-label is formatted with 'Index' and 'Normalized Value' on either side of a colon, \n each padded to 20 characters.\n - The y-label is set in the same manner, with 'Frequency' and 'Normalized Value' on either side of a colon.\n\n Parameters:\n - csv_file_path (str): Path to the CSV file. 
The file must contain a column named 'column1'.\n\n Returns:\n - The matplotlib.axes.Axes object with the plot of the normalized data.\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> ax = f_844('data.csv')\n >>> ax.get_title()\n 'Plot Title : Normalized Column 1'\n \"\"\"", "canonical_solution": " df = pd.read_csv(csv_file_path)\n mean = df[\"column1\"].mean()\n std = df[\"column1\"].std()\n df[\"column1_normalized\"] = (df[\"column1\"] - mean) / std\n\n # Creating a figure and axes\n _, ax = plt.subplots()\n # Plotting on the created axes\n ax.plot(df[\"column1_normalized\"])\n title = \"%*s : %*s\" % (20, \"Plot Title\", 20, \"Normalized Column 1\")\n xlabel = \"%*s : %*s\" % (20, \"Index\", 20, \"Normalized Value\")\n ylabel = \"%*s : %*s\" % (20, \"Frequency\", 20, \"Normalized Value\")\n ax.set_title(title)\n ax.set_xlabel(xlabel)\n ax.set_ylabel(ylabel)\n\n # Return the axes object for further manipulation\n return ax", "test": "import unittest\nfrom unittest.mock import patch\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_844 function.\"\"\"\n @patch(\"pandas.read_csv\")\n def test_title_format(self, mock_read_csv):\n \"\"\"Test that the function returns the correct title.\"\"\"\n # Mocking the DataFrame\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n expected_title = \" Plot Title : Normalized Column 1\"\n self.assertEqual(ax.get_title(), expected_title)\n @patch(\"pandas.read_csv\")\n def test_xlabel_format(self, mock_read_csv):\n \"\"\"Test that the function returns the correct xlabel.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n expected_xlabel = \" Index : Normalized Value\"\n self.assertEqual(ax.get_xlabel(), expected_xlabel)\n @patch(\"pandas.read_csv\")\n def test_ylabel_format(self, 
mock_read_csv):\n \"\"\"Test that the function returns the correct ylabel.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n expected_ylabel = \" Frequency : Normalized Value\"\n self.assertEqual(ax.get_ylabel(), expected_ylabel)\n @patch(\"pandas.read_csv\")\n def test_data_points_length(self, mock_read_csv):\n \"\"\"Test that the function returns the correct number of data points.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n line = ax.get_lines()[0]\n self.assertEqual(len(line.get_data()[1]), 10)\n @patch(\"pandas.read_csv\")\n def test_data_points_range(self, mock_read_csv):\n \"\"\"Test that the function returns the correct data points.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n line = ax.get_lines()[0]\n data_points = line.get_data()[1]\n self.assertTrue(all(-3 <= point <= 3 for point in data_points))\n def tearDown(self):\n plt.clf()", "apis": ["pandas.read_csv", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["This function reads data from a CSV file, normalizes a specific column named 'column1', and then plots the normalized data.", "- The title is created using Python's string formatting, aligning 'Plot Title' and 'Normalized Column 1' on either side of a", "colon, each padded to 20 characters.", "- Similarly, the x-label is formatted with 'Index' and 'Normalized Value' on either side of a colon,", "each padded to 20 characters.", "- The y-label is set in the same manner, with 'Frequency' and 'Normalized Value' on either side of a colon."], "note": [], "params": ["csv_file_path (str): Path to the CSV file. 
The file must contain a column named 'column1'."], "returns": ["The matplotlib.axes.Axes object with the plot of the normalized data."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> ax = f_844('data.csv')", ">>> ax.get_title()", "'Plot Title : Normalized Column 1'"]}} -{"task_id": "f_348", "prompt": "import numpy as np\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_348(\n P: np.ndarray,\n T: np.ndarray,\n n_clusters: int = 3,\n random_state: int = 0,\n n_init: int = 10,\n) -> (np.ndarray, plt.Axes):\n \"\"\"\n Calculate the product of a matrix 'P' and a 3D tensor 'T', flatten the result,\n apply KMeans clustering to the flattened data, and visualize it.\n\n Parameters:\n P (numpy.ndarray): The input matrix.\n T (numpy.ndarray): The input tensor with shape (3, 3, 3).\n n_clusters (int): The number of clusters for KMeans clustering. Default is 3.\n random_state (int): The random state for KMeans clustering. Default is 0.\n n_init (int): Number of time the k-means algorithm will be run with different centroid seeds. 
Default is 10.\n\n Returns:\n cluster_result (numpy.ndarray): The result of KMeans clustering.\n ax (matplotlib.axes.Axes): The visualization of the KMeans clustering.\n\n Requirements:\n - numpy\n - sklearn\n - matplotlib\n\n Example:\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])\n >>> T = np.random.rand(3, 3, 3)\n >>> cluster_result, ax = f_348(P, T, n_clusters=3, random_state=0, n_init=10)\n >>> type(cluster_result)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n tensor_shape = (3, 3, 3)\n if not T.shape == tensor_shape:\n raise ValueError(\"Provided tensor does not match the expected shape.\")\n\n # Using numpy for tensor product\n result = np.tensordot(P, T, axes=[1, 1]).swapaxes(0, 1)\n flattened_result = result.reshape(-1, tensor_shape[2]) # Flattening the result\n kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)\n cluster_result = kmeans.fit_predict(flattened_result)\n fig, ax = plt.subplots()\n ax.scatter(flattened_result[:, 0], flattened_result[:, 1], c=cluster_result)\n ax.set_title(\"KMeans Clustering Visualization\")\n return cluster_result, ax", "test": "import unittest\nimport numpy as np\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.random_seed = 0\n np.random.seed(self.random_seed)\n self.P = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])\n self.T = np.random.rand(3, 3, 3)\n def test_case_1(self):\n # Test with easy example\n P = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])\n T = np.array(\n [\n [[1, 0, 0], [0, 1, 1], [0, 0, 1]],\n [[1, 1, 0], [0, 1, 0], [0, 0, 1]],\n [[1, 0, 1], [0, 1, 0], [1, 0, 1]],\n ]\n )\n cluster_result, _ = f_348(P, T, n_clusters=3)\n self.assertEqual(len(np.unique(cluster_result)), 3)\n def test_case_2(self):\n # Test correct cluster centers (against manual calculated results)\n n_clusters = 3\n n_init = 10\n possible_labels = list(range(n_clusters))\n 
result, _ = f_348(self.P, self.T, random_state=self.random_seed, n_init=n_init)\n manual_results = KMeans(\n n_clusters=n_clusters, random_state=self.random_seed, n_init=n_init\n ).fit(\n np.tensordot(self.P, self.T, axes=[1, 1])\n .swapaxes(0, 1)\n .reshape(-1, n_clusters)\n )\n self.assertTrue((result == manual_results.labels_).all())\n self.assertEqual(result.shape, (self.P.shape[0] * n_clusters,))\n self.assertEqual(\n manual_results.cluster_centers_.shape, (n_clusters, n_clusters)\n )\n self.assertTrue((pred in possible_labels for pred in result))\n def test_case_3(self):\n # Test visualizations\n _, ax = f_348(self.P, self.T)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"KMeans Clustering Visualization\")\n num_data_points = len(ax.collections[0].get_offsets())\n self.assertEqual(num_data_points, self.P.shape[0] * 3)\n def test_case_4(self):\n # Test changing number of clusters\n for n_clusters in [1, 3, 5]:\n cluster_result, _ = f_348(self.P, self.T, n_clusters=n_clusters)\n unique_clusters = np.unique(cluster_result)\n self.assertEqual(len(unique_clusters), n_clusters)\n def test_case_5(self):\n # Function should fail with incompatible input - n_cluster and n_init\n for invalid in [-1, 0, \"invalid\"]:\n with self.assertRaises(Exception):\n f_348(self.P, self.T, n_clusters=invalid)\n def test_case_6(self):\n # Function should fail with incompatible input - shapes\n with self.assertRaises(ValueError):\n f_348(np.random.randn(2, 2), self.T)\n with self.assertRaises(ValueError):\n f_348(self.P, np.random.randn(2, 2))\n def test_case_7(self):\n # Function should fail with incompatible input - random_state\n with self.assertRaises(ValueError):\n f_348(self.P, self.T, random_state=\"invalid\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.ndarray", "sklearn.cluster.KMeans", "matplotlib.pyplot.Axes", "numpy.tensordot", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "sklearn"], "doc": {"description": 
["Calculate the product of a matrix 'P' and a 3D tensor 'T', flatten the result,", "apply KMeans clustering to the flattened data, and visualize it."], "note": [], "params": ["P (numpy.ndarray): The input matrix.", "T (numpy.ndarray): The input tensor with shape (3, 3, 3).", "n_clusters (int): The number of clusters for KMeans clustering. Default is 3.", "random_state (int): The random state for KMeans clustering. Default is 0.", "n_init (int): Number of time the k-means algorithm will be run with different centroid seeds. Default is 10."], "returns": ["cluster_result (numpy.ndarray): The result of KMeans clustering.", "ax (matplotlib.axes.Axes): The visualization of the KMeans clustering."], "reqs": ["numpy", "sklearn", "matplotlib"], "raises": [], "example": [">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])", ">>> T = np.random.rand(3, 3, 3)", ">>> cluster_result, ax = f_348(P, T, n_clusters=3, random_state=0, n_init=10)", ">>> type(cluster_result)", "", ">>> type(ax)", ""]}} -{"task_id": "f_927", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\nimport matplotlib.pyplot as plt\n\n\ndef f_927(data):\n \"\"\"\n Processes a dataset containing salary information and experience, then plots normalized salary against experience.\n The function executes the following steps:\n 1. Input Validation: Checks if the input data dictionary contains the required keys ('Salary_String' and 'Experience').\n Raises a ValueError if the necessary keys are missing.\n 2. DataFrame Conversion: Converts the input data into a pandas DataFrame for easier manipulation.\n 3. Empty Data Handling: Checks if the DataFrame is empty. If so, it returns a default Axes instance with\n labeled axes but no data plotted. This handles cases where there is no data to plot.\n 4. 
Salary Conversion: Converts 'Salary_String' values from comma-separated strings to floats.\n It handles potential conversion errors by catching ValueErrors and re-raising them with a custom message.\n 5. Salary Normalization: Applies Min-Max scaling to normalize the salary values. This step transforms\n the salary data into a range between 0 and 1, allowing for easier comparison and visualization.\n 6. Data Plotting: Creates a scatter plot of the normalized salary against experience using matplotlib.\n The plot's axes are labeled accordingly.\n\n Parameters:\n - data (dict): A dictionary with two keys: 'Salary_String' and 'Experience'.\n 'Salary_String' should contain salary values as comma-separated strings.\n 'Experience' should contain corresponding experience values as integers.\n\n Returns:\n - matplotlib.axes.Axes: An Axes instance with the plotted scatter plot.\n\n Raises:\n - ValueError: If the input dictionary does not contain the required keys or if data conversion from string to float fails.\n\n Requirements:\n - pandas\n - sklearn\n - matplotlib\n\n Example:\n >>> ax = f_927({'Salary_String': ['1,000', '2,000', '3,000'], 'Experience': [1, 2, 3]})\n >>> print(ax.get_title())\n Normalized Salary vs Experience\n \"\"\"", "canonical_solution": " # Validate input data\n if not all(key in data for key in [\"Salary_String\", \"Experience\"]):\n raise ValueError(\n \"Input data must contain 'Salary_String' and 'Experience' keys.\"\n )\n\n # Convert data to DataFrame\n df = pd.DataFrame(data)\n\n # Check if the data is empty\n if df.empty:\n # Handle empty data case (e.g., return a default Axes instance or raise an error)\n _, ax = plt.subplots()\n ax.set_title(\"Normalized Salary vs Experience\")\n ax.set_xlabel(\"Experience\")\n ax.set_ylabel(\"Normalized Salary\")\n return ax\n\n # Convert Salary_String to float and handle potential conversion errors\n try:\n df[\"Salary_Float\"] = df[\"Salary_String\"].str.replace(\",\", \"\").astype(float)\n except 
ValueError:\n raise ValueError(\"Error converting Salary_String to float.\")\n\n # Normalize the Salary_Float values\n scaler = MinMaxScaler()\n df[\"Normalized_Salary\"] = scaler.fit_transform(df[[\"Salary_Float\"]])\n\n # Plot the data\n _, ax = plt.subplots()\n ax.scatter(df[\"Experience\"], df[\"Normalized_Salary\"])\n ax.set_title(\"Normalized Salary vs Experience\")\n ax.set_xlabel(\"Experience\")\n ax.set_ylabel(\"Normalized Salary\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nfrom matplotlib.axes import Axes\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_927.\"\"\"\n def test_valid_data(self):\n \"\"\"Test with valid data.\"\"\"\n data = {\"Salary_String\": [\"1,000\", \"2,000\", \"3,000\"], \"Experience\": [1, 2, 3]}\n result = f_927(data)\n self.assertIsInstance(result, Axes)\n def test_missing_key(self):\n \"\"\"Test with missing key in input dictionary.\"\"\"\n data = {\"Salary_String\": [\"1,000\", \"2,000\", \"3,000\"]}\n with self.assertRaises(ValueError):\n f_927(data)\n def test_empty_data(self):\n \"\"\"Test with empty data.\"\"\"\n data = {\"Salary_String\": [], \"Experience\": []}\n result = f_927(data)\n self.assertIsInstance(result, Axes)\n def test_invalid_salary_format(self):\n \"\"\"Test with invalid salary format.\"\"\"\n data = {\n \"Salary_String\": [\"1.000\", \"2,000\", \"Three Thousand\"],\n \"Experience\": [1, 2, 3],\n }\n with self.assertRaises(ValueError):\n f_927(data)\n def test_mismatched_lengths(self):\n \"\"\"Test with mismatched lengths of salary and experience arrays.\"\"\"\n data = {\"Salary_String\": [\"1,000\", \"2,000\"], \"Experience\": [1, 2, 3]}\n with self.assertRaises(ValueError):\n f_927(data)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["sklearn.preprocessing.MinMaxScaler", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["Processes a dataset containing salary 
information and experience, then plots normalized salary against experience.", "The function executes the following steps:", "1. Input Validation: Checks if the input data dictionary contains the required keys ('Salary_String' and 'Experience').", "Raises a ValueError if the necessary keys are missing.", "2. DataFrame Conversion: Converts the input data into a pandas DataFrame for easier manipulation.", "3. Empty Data Handling: Checks if the DataFrame is empty. If so, it returns a default Axes instance with", "labeled axes but no data plotted. This handles cases where there is no data to plot.", "4. Salary Conversion: Converts 'Salary_String' values from comma-separated strings to floats.", "It handles potential conversion errors by catching ValueErrors and re-raising them with a custom message.", "5. Salary Normalization: Applies Min-Max scaling to normalize the salary values. This step transforms", "the salary data into a range between 0 and 1, allowing for easier comparison and visualization.", "6. 
Data Plotting: Creates a scatter plot of the normalized salary against experience using matplotlib.", "The plot's axes are labeled accordingly."], "note": [], "params": ["data (dict): A dictionary with two keys: 'Salary_String' and 'Experience'.", "'Salary_String' should contain salary values as comma-separated strings.", "'Experience' should contain corresponding experience values as integers."], "returns": ["matplotlib.axes.Axes: An Axes instance with the plotted scatter plot."], "reqs": ["pandas", "sklearn", "matplotlib"], "raises": ["ValueError: If the input dictionary does not contain the required keys or if data conversion from string to float fails."], "example": [">>> ax = f_927({'Salary_String': ['1,000', '2,000', '3,000'], 'Experience': [1, 2, 3]})", ">>> print(ax.get_title())", "Normalized Salary vs Experience"]}} -{"task_id": "f_800", "prompt": "import string\nimport re\n\n\ndef f_800(text: str) -> tuple:\n \"\"\"\n Counts the number of words, characters, and unique characters in a given text.\n\n Parameters:\n - text (str): The input text to be analyzed.\n\n Returns:\n - tuple: A tuple containing three integers: the number of words,\n the number of characters,\n the number of unique characters.\n\n Requirements:\n - string\n - re\n\n Note:\n - This function considers whitespace-separated substrings as words.\n - When counting characters, this function excludes whitespace and special\n characters (i.e. string.punctuation).\n\n Example:\n >>> f_800('Hello, world!')\n (2, 10, 7)\n >>> f_800('Python is awesome! 
')\n (3, 15, 12)\n \"\"\"", "canonical_solution": " words = text.split()\n chars = re.sub(\"\\s\", \"\", re.sub(f\"[{string.punctuation}]\", \"\", text))\n\n return len(words), len(chars), len(set(chars))", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test simple text without any punctuation.\n result = f_800(\"Hello world\")\n self.assertEqual(result, (2, 10, 7))\n def test_case_2(self):\n # Test simple text that includes punctuation.\n result = f_800(\"Hello, world!\")\n self.assertEqual(result, (2, 10, 7))\n def test_case_3(self):\n # Test single word and no punctuation.\n result = f_800(\"Hello\")\n self.assertEqual(result, (1, 5, 4))\n def test_case_4(self):\n # Test single word that includes punctuation.\n result = f_800(\"Hello!\")\n self.assertEqual(result, (1, 5, 4))\n def test_case_5(self):\n # Test empty string.\n result = f_800(\"\")\n self.assertEqual(result, (0, 0, 0))\n def test_case_6(self):\n # Test text with numbers and punctuation.\n result = f_800(\"There are 4 numbers here: 1, 2, 3, and 4.\")\n self.assertEqual(result, (10, 27, 15))\n def test_case_7(self):\n # Test text with only whitespace and punctuation.\n result = f_800(\" , , !\")\n self.assertEqual(result, (3, 0, 0))\n def test_case_8(self):\n # Test text with multiple spaces between words.\n result = f_800(\"Multiple spaces here\")\n self.assertEqual(result, (3, 18, 12))\n def test_case_9(self):\n # Test a long text.\n long_text = \"This is a longer text designed to test the function's ability to handle more complex input, including a variety of characters and spaces.\"\n result = f_800(long_text)\n self.assertEqual(result, (23, 112, 22))", "apis": ["string.punctuation", "re.sub"], "libs": ["string", "re"], "doc": {"description": ["Counts the number of words, characters, and unique characters in a given text."], "note": ["This function considers whitespace-separated substrings as words.", "When counting characters, this function excludes 
whitespace and special", "characters (i.e. string.punctuation)."], "params": ["text (str): The input text to be analyzed."], "returns": ["tuple: A tuple containing three integers: the number of words,", "the number of characters,", "the number of unique characters."], "reqs": ["string", "re"], "raises": [], "example": [">>> f_800('Hello, world!')", "(2, 10, 7)", ">>> f_800('Python is awesome! ')", "(3, 15, 12)"]}} -{"task_id": "f_785", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n# Constants\nSTART_DATE = '2016-01-01'\nPERIODS = 13\nFREQ = 'WOM-2FRI'\nCATEGORIES = ['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports']\n\ndef f_785(start_date=START_DATE, periods=PERIODS, freq=FREQ, categories=CATEGORIES):\n \"\"\"\n Create and visualize a sales report for different categories over a period of time.\n \n Functionality:\n - Generates a DataFrame containing sales data for given categories over a time range.\n - Visualizes the sales data using a line plot.\n \n Input:\n - start_date (str): The start date for the report in 'YYYY-MM-DD' format. Default is '2016-01-01'.\n - periods (int): The number of periods for the report. Default is 13.\n - freq (str): The frequency of dates to be generated. Default is 'WOM-2FRI' (WeekOfMonth-2nd Friday).\n - categories (list): List of categories to include in the report. 
Default is ['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports'].\n\n Output:\n - Returns a DataFrame containing the sales data.\n - Returns the Matplotlib Axes object for the plot.\n\n Requirements:\n - pandas\n - datetime\n - matplotlib.pyplot\n - numpy\n\n Example:\n >>> df, ax = f_785(start_date='2020-01-01', periods=5, freq='W-MON', categories=['Electronics', 'Fashion'])\n >>> df\n Date Category Sales\n 0 2020-01-06 Electronics 272\n 1 2020-01-06 Fashion 147\n 2 2020-01-13 Electronics 217\n 3 2020-01-13 Fashion 292\n 4 2020-01-20 Electronics 423\n 5 2020-01-20 Fashion 351\n 6 2020-01-27 Electronics 295\n 7 2020-01-27 Fashion 459\n 8 2020-02-03 Electronics 109\n 9 2020-02-03 Fashion 311\n \"\"\"", "canonical_solution": " np.random.seed(0) # Ensure reproducible sales figures\n date_range = pd.date_range(start=start_date, periods=periods, freq=freq)\n report_data = []\n\n for date in date_range:\n for category in categories:\n sales = np.random.randint(low=100, high=500)\n report_data.append([date, category, sales])\n\n sales_df = pd.DataFrame(report_data, columns=['Date', 'Category', 'Sales'])\n\n fig, ax = plt.subplots(figsize=(12, 8))\n sales_df.pivot(index='Date', columns='Category', values='Sales').plot(ax=ax)\n ax.set_title('Category-wise Sales Trends')\n ax.grid(True)\n \n return sales_df, ax", "test": "import unittest\nimport pandas as pd\n# Unit tests for the f_785 function\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n \"\"\"Test with default parameters.\"\"\"\n df, ax = f_785()\n self.assertIsInstance(df, pd.DataFrame)\n self.assertTrue(all(x in df.columns for x in ['Date', 'Category', 'Sales']))\n self.assertEqual(len(df['Category'].unique()), 5)\n self.assertEqual(ax.get_title(), 'Category-wise Sales Trends')\n def test_case_2(self):\n \"\"\"Test with custom start_date and periods.\"\"\"\n df, _ = f_785(start_date='2021-01-01', periods=7)\n self.assertTrue(df['Date'].min() >= pd.to_datetime('2021-01-01'))\n 
self.assertEqual(df['Date'].nunique(), 7)\n expected_rows = 7 * len(['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports'])\n self.assertEqual(len(df), expected_rows)\n \n def test_case_3(self):\n \"\"\"Test with a different frequency and custom categories.\"\"\"\n df, _ = f_785(freq='W-TUE', categories=['Books', 'Games'])\n self.assertEqual(len(df['Category'].unique()), 2)\n self.assertTrue(all(category in ['Books', 'Games'] for category in df['Category'].unique()))\n def test_case_4(self):\n \"\"\"Test with all parameters customized.\"\"\"\n df, _ = f_785(start_date='2019-06-01', periods=10, freq='W-WED', categories=['Food', 'Clothing'])\n self.assertEqual(len(df['Category'].unique()), 2)\n self.assertTrue(all(category in ['Food', 'Clothing'] for category in df['Category'].unique()))\n def test_case_5(self):\n \"\"\"Test with a single category.\"\"\"\n df, _ = f_785(categories=['Electronics'])\n self.assertTrue(all(df['Category'] == 'Electronics'))\n self.assertEqual(len(df), 13) # Default periods", "apis": ["numpy.random.randint", "numpy.random", "numpy.random.seed", "pandas.date_range", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["numpy", "pandas", "matplotlib"], "doc": {"description": ["Create and visualize a sales report for different categories over a period of time.", "Functionality:", "- Generates a DataFrame containing sales data for given categories over a time range.", "- Visualizes the sales data using a line plot.", "Input:", "- start_date (str): The start date for the report in 'YYYY-MM-DD' format. Default is '2016-01-01'.", "- periods (int): The number of periods for the report. Default is 13.", "- freq (str): The frequency of dates to be generated. Default is 'WOM-2FRI' (WeekOfMonth-2nd Friday).", "- categories (list): List of categories to include in the report. 
Default is ['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports'].", "Output:", "- Returns a DataFrame containing the sales data.", "- Returns the Matplotlib Axes object for the plot."], "note": [], "params": [], "returns": [], "reqs": ["pandas", "datetime", "matplotlib.pyplot", "numpy"], "raises": [], "example": [">>> df, ax = f_785(start_date='2020-01-01', periods=5, freq='W-MON', categories=['Electronics', 'Fashion'])", ">>> df", "Date Category Sales", "0 2020-01-06 Electronics 272", "1 2020-01-06 Fashion 147", "2 2020-01-13 Electronics 217", "3 2020-01-13 Fashion 292", "4 2020-01-20 Electronics 423", "5 2020-01-20 Fashion 351", "6 2020-01-27 Electronics 295", "7 2020-01-27 Fashion 459", "8 2020-02-03 Electronics 109", "9 2020-02-03 Fashion 311"]}} -{"task_id": "f_585", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_585(data, cols):\n \"\"\"\n Turn the provided data into a DataFrame and then calculate the correlation matrix of numeric columns.\n \n Parameters:\n - data (list): List of lists with the data, where the length of the inner list equals the number of columns\n - cols (list): List of column names\n \n Returns:\n - correlation_matrix (pd.DataFrame): The correlation matrix.\n\n Requirements:\n - pandas\n - numpy\n \n Example:\n >>> correlation_matrix = f_585([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], ['x', 'y', 'z'])\n >>> print(correlation_matrix)\n x y z\n x 1.000000 0.596040 0.866025\n y 0.596040 1.000000 0.114708\n z 0.866025 0.114708 1.000000\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data, columns=cols)\n \n df_np = np.array(df)\n df = pd.DataFrame(df_np, columns=cols)\n \n correlation_matrix = df.corr()\n return correlation_matrix", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], ['x', 
'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n def test_case_2(self):\n df = pd.DataFrame([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n def test_case_3(self):\n df = pd.DataFrame([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n \n def test_case_4(self):\n df = pd.DataFrame([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n def test_case_5(self):\n df = pd.DataFrame([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0], [-7.0, -8.0, -9.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0], [-7.0, -8.0, -9.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))", "apis": ["numpy.array", "pandas.DataFrame"], "libs": ["numpy", "pandas"], "doc": {"description": ["Turn the provided data into a DataFrame and then calculate the correlation matrix of numeric columns."], "note": [], "params": ["data (list): List of lists with the data, where the length of the inner list equals the number of columns", "cols (list): List of column names"], "returns": ["correlation_matrix (pd.DataFrame): The correlation matrix."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> correlation_matrix = f_585([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], ['x', 'y', 'z'])", ">>> print(correlation_matrix)", "x y z", "x 1.000000 0.596040 0.866025", "y 0.596040 1.000000 0.114708", "z 0.866025 0.114708 1.000000"]}} -{"task_id": "f_341", "prompt": "import string\nimport 
matplotlib.pyplot as plt\n\n\ndef f_341(s):\n \"\"\"\n Calculate the frequency of each letter in a string and return a bar chart of frequencies.\n Results are case-insensitive. If non-string input is provided, function will throw an error.\n\n Parameters:\n s (str): The string to calculate letter frequencies.\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the frequency of each letter.\n - Axes: The plot of 'Letter Frequencies' with 'Letters' on the x-axis and 'Frequency'\n on the y-axis.\n\n Requirements:\n - string\n - matplotlib.pyplot\n\n Example:\n >>> s = 'This is a test string.'\n >>> freqs, ax = f_341(s)\n >>> freqs\n {'a': 1, 'b': 0, 'c': 0, 'd': 0, 'e': 1, 'f': 0, 'g': 1, 'h': 1, 'i': 3, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 1, 'o': 0, 'p': 0, 'q': 0, 'r': 1, 's': 4, 't': 4, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0}\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n if not isinstance(s, str):\n raise TypeError(\"Expected string input\")\n\n LETTERS = string.ascii_lowercase\n\n s = s.lower()\n\n letter_counts = {letter: s.count(letter) for letter in LETTERS}\n\n fig, ax = plt.subplots()\n ax.bar(letter_counts.keys(), letter_counts.values())\n ax.set_xlabel(\"Letters\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(\"Letter Frequencies\")\n\n return letter_counts, ax", "test": "import unittest\nimport string\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with a simple sentence\n s = \"This is a test string.\"\n expected_output = {\n letter: s.lower().count(letter) for letter in string.ascii_lowercase\n }\n result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_2(self):\n # Test with a string having all alphabets\n s = \"abcdefghijklmnopqrstuvwxyz\"\n expected_output = {letter: 1 for letter in string.ascii_lowercase}\n 
result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_3(self):\n # Test with a string having no alphabets\n s = \"1234567890!@#$%^&*()\"\n expected_output = {letter: 0 for letter in string.ascii_lowercase}\n result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_4(self):\n # Test with an empty string\n s = \"\"\n expected_output = {letter: 0 for letter in string.ascii_lowercase}\n result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_5(self):\n # Test error handling\n for invalid in [123, []]:\n with self.assertRaises(Exception):\n f_341(invalid)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["string.ascii_lowercase", "matplotlib.pyplot.subplots"], "libs": ["string", "matplotlib"], "doc": {"description": ["Calculate the frequency of each letter in a string and return a bar chart of frequencies.", "Results are case-insensitive. 
If non-string input is provided, function will throw an error."], "note": [], "params": ["s (str): The string to calculate letter frequencies."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the frequency of each letter.", "Axes: The plot of 'Letter Frequencies' with 'Letters' on the x-axis and 'Frequency'", "on the y-axis."], "reqs": ["string", "matplotlib.pyplot"], "raises": [], "example": [">>> s = 'This is a test string.'", ">>> freqs, ax = f_341(s)", ">>> freqs", "{'a': 1, 'b': 0, 'c': 0, 'd': 0, 'e': 1, 'f': 0, 'g': 1, 'h': 1, 'i': 3, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 1, 'o': 0, 'p': 0, 'q': 0, 'r': 1, 's': 4, 't': 4, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0}", ">>> type(ax)", ""]}} -{"task_id": "f_829", "prompt": "import json\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\n\n\ndef f_829(json_data: str, key_path: list):\n \"\"\"\n Extracts and visualizes numerical data from a JSON structure based on a specified path of keys.\n\n Parameters:\n json_data (str): JSON formatted string.\n key_path (list): List of strings representing the nested keys to locate the data within the JSON.\n\n Returns:\n matplotlib.figure.Figure: A matplotlib figure showing a boxplot of the data values.\n\n Raises:\n KeyError: If a specified key is not found.\n ValueError: If no numeric data is found, or the data string is empty or corrupted.\n\n Examples:\n >>> json_data = '{\"level1\":{\"level2\":{\"data\":\"1,2,3,4\"}}}'\n >>> key_path = ['level1', 'level2', 'data']\n >>> fig = f_829(json_data, key_path)\n >>> isinstance(fig, plt.Figure)\n True\n \"\"\"", "canonical_solution": " try:\n data = json.loads(json_data)\n for key in key_path:\n data = data[key]\n values = np.fromstring(data, sep=\",\")\n\n if values.size == 0:\n raise ValueError(\"No numeric data found or empty data string.\")\n df = pd.DataFrame(values, columns=[\"Values\"])\n\n fig, ax = plt.subplots()\n sns.boxplot(data=df, ax=ax)\n return 
fig\n\n except json.decoder.JSONDecodeError as e:\n raise ValueError(f\"Input malformed: {e}\")\n except KeyError as e:\n raise KeyError(f\"Key error occurred: {e}\")\n except ValueError as e:\n raise ValueError(f\"Value error occurred: {e}\")", "test": "import unittest\nimport warnings\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_correct_data_extraction(self):\n \"\"\"Tests correct extraction and visualization from valid JSON data.\"\"\"\n json_data = '{\"level1\":{\"level2\":{\"data\":\"1,2,3,4\"}}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n fig = f_829(json_data, key_path)\n self.assertIsInstance(fig, plt.Figure)\n def test_missing_key_error(self):\n \"\"\"Tests response to missing key in JSON data.\"\"\"\n json_data = '{\"level1\":{}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n with self.assertRaises(KeyError):\n f_829(json_data, key_path)\n def test_corrupted_json(self):\n \"\"\"Tests response to malformed data.\"\"\"\n key_path = [\"level1\", \"level2\", \"data\"]\n for x in [\"{'level1':{}}\", '{\"level1\":{\"level' \"invalid\", \"\"]:\n with self.assertRaises(ValueError):\n f_829(x, key_path)\n def test_empty_data_value_error(self):\n \"\"\"Tests response to empty numeric data.\"\"\"\n json_data = '{\"level1\":{\"level2\":{\"data\":\"\"}}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n with self.assertRaises(ValueError):\n f_829(json_data, key_path)\n def test_non_numeric_data_value_error(self):\n \"\"\"Tests response to non-numeric data.\"\"\"\n json_data = '{\"level1\":{\"level2\":{\"data\":\"a,b,c\"}}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n with warnings.catch_warnings():\n warnings.simplefilter(\"ignore\")\n with self.assertRaises(ValueError):\n f_829(json_data, key_path)", "apis": ["json.decoder", "seaborn.boxplot", "json.loads", "numpy.fromstring", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["numpy", "json", "matplotlib", "pandas", "seaborn"], "doc": {"description": 
["Extracts and visualizes numerical data from a JSON structure based on a specified path of keys."], "note": [], "params": ["json_data (str): JSON formatted string.", "key_path (list): List of strings representing the nested keys to locate the data within the JSON."], "returns": ["matplotlib.figure.Figure: A matplotlib figure showing a boxplot of the data values."], "reqs": [], "raises": ["KeyError: If a specified key is not found.", "ValueError: If no numeric data is found, or the data string is empty or corrupted."], "example": ["Examples:", ">>> json_data = '{\"level1\":{\"level2\":{\"data\":\"1,2,3,4\"}}}'", ">>> key_path = ['level1', 'level2', 'data']", ">>> fig = f_829(json_data, key_path)", ">>> isinstance(fig, plt.Figure)", "True"]}} -{"task_id": "f_872", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_872(rows=100, columns=3):\n \"\"\"\n Create a Pandas DataFrame with random alphabets in each cell.\n The DataFrame will have a specified number of rows and columns.\n Each column is named with a string from the list ['a', 'b', 'c', ...]\n depending on the number of columns specified.\n\n Parameters:\n - rows (int, optional): Number of rows in the DataFrame. Defaults to 100.\n - columns (int, optional): Number of columns in the DataFrame. 
Defaults to 3.\n\n Returns:\n DataFrame: A pandas DataFrame with random alphabets.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> np.random.seed(0)\n >>> df = f_872(5, 3)\n >>> print(df)\n a b c\n 0 m p v\n 1 a d d\n 2 h j t\n 3 v s e\n 4 x g y\n >>> df['a'].value_counts()\n a\n m 1\n a 1\n h 1\n v 1\n x 1\n Name: count, dtype: int64\n \"\"\"", "canonical_solution": " column_names = [\n chr(97 + i) for i in range(columns)\n ] # generate column names based on the number of columns\n values = list(\"abcdefghijklmnopqrstuvwxyz\")\n data = np.random.choice(values, size=(rows, columns))\n df = pd.DataFrame(data, columns=column_names)\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Tests case for function `f_872`.\"\"\"\n def test_dataframe_shape_default(self):\n \"\"\"Test if the DataFrame has default shape (100 rows, 3 columns) with default parameters.\"\"\"\n np.random.seed(1)\n df_test = f_872()\n self.assertEqual(df_test.shape, (100, 3))\n def test_dataframe_shape_custom_rows(self):\n \"\"\"Test if the DataFrame has the correct shape when a custom number of rows is specified.\"\"\"\n np.random.seed(2)\n df_test = f_872(50)\n self.assertEqual(df_test.shape, (50, 3))\n def test_dataframe_shape_custom_columns(self):\n \"\"\"Test if the DataFrame has the correct shape with a custom number of columns.\"\"\"\n np.random.seed(3)\n df_test = f_872(50, 5)\n self.assertEqual(df_test.shape, (50, 5))\n def test_dataframe_columns_default(self):\n \"\"\"Test if the DataFrame has default column names ['a', 'b', 'c'] with default parameters.\"\"\"\n np.random.seed(4)\n df_test = f_872()\n self.assertListEqual(list(df_test.columns), [\"a\", \"b\", \"c\"])\n def test_dataframe_columns_custom(self):\n \"\"\"Test if the DataFrame has the correct column names when a custom number of columns is specified.\"\"\"\n np.random.seed(5)\n df_test = f_872(columns=5)\n expected_columns = [\"a\", \"b\", \"c\", \"d\", \"e\"]\n 
self.assertListEqual(list(df_test.columns), expected_columns)\n def test_dataframe_values(self):\n \"\"\"Test if each cell in the DataFrame contains a letter from the English alphabet.\"\"\"\n np.random.seed(6)\n df_test = f_872()\n for col in df_test.columns:\n self.assertTrue(\n set(df_test[col].unique()).issubset(set(\"abcdefghijklmnopqrstuvwxyz\"))\n )\n def test_dataframe_empty(self):\n \"\"\"Test if an empty DataFrame is created when 0 rows are specified.\"\"\"\n np.random.seed(7)\n df_test = f_872(0)\n self.assertEqual(df_test.shape, (0, 3))", "apis": ["numpy.random.choice", "pandas.DataFrame", "numpy.random"], "libs": ["numpy", "pandas"], "doc": {"description": ["Create a Pandas DataFrame with random alphabets in each cell.", "The DataFrame will have a specified number of rows and columns.", "Each column is named with a string from the list ['a', 'b', 'c', ...]", "depending on the number of columns specified."], "note": [], "params": ["rows (int, optional): Number of rows in the DataFrame. Defaults to 100.", "columns (int, optional): Number of columns in the DataFrame. Defaults to 3."], "returns": ["DataFrame: A pandas DataFrame with random alphabets."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> np.random.seed(0)", ">>> df = f_872(5, 3)", ">>> print(df)", "a b c", "0 m p v", "1 a d d", "2 h j t", "3 v s e", "4 x g y", ">>> df['a'].value_counts()", "a", "m 1", "a 1", "h 1", "v 1", "x 1", "Name: count, dtype: int64"]}} -{"task_id": "f_799", "prompt": "import re\nimport string\nimport random\n\n\ndef f_799(text: str, seed=None) -> str:\n \"\"\"\n Transforms a given string by removing special characters, normalizing whitespace,\n and randomizing character casing.\n\n Parameters:\n - text (str): The text string to be preprocessed.\n - seed (int, optional): Random seed for reproducibility. 
Defaults to None (not set).\n\n Returns:\n - str: The preprocessed text string.\n\n Requirements:\n - re\n - string\n - random\n\n Note:\n - This function considers special characters to be string punctuations.\n - Spaces, tabs, and newlines are replaced with with '_', '__', and '___' respectively.\n - To randomize casing, this function converts characters to uppercase with a 50% probability.\n\n Example:\n >>> f_799('Hello World!', 0)\n 'HeLlo___WORlD'\n >>> f_799('attention is all you need', 42)\n 'ATtENTIOn_IS_ALL_You_Need'\n \"\"\"", "canonical_solution": "\n if seed is not None:\n random.seed(seed)\n\n text = re.sub(\"[%s]\" % re.escape(string.punctuation), \"\", text)\n\n REPLACEMENTS = {\" \": \"_\", \"\\t\": \"__\", \"\\n\": \"___\"}\n for k, v in REPLACEMENTS.items():\n text = text.replace(k, v)\n\n text = \"\".join(random.choice([k.upper(), k]) for k in text)\n\n return text", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_799(\"Hello World!\", seed=1)\n self.assertNotIn(\" \", result, \"Spaces should be replaced.\")\n self.assertNotIn(\"!\", result, \"Special characters should be removed.\")\n self.assertEqual(\n len(result), len(\"Hello___World\"), \"Length should match processed input.\"\n )\n def test_case_2(self):\n result = f_799(\"Python!\", seed=2)\n self.assertNotIn(\"!\", result, \"Special characters should be removed.\")\n self.assertEqual(\n len(result), len(\"Python\"), \"Length should match processed input.\"\n )\n def test_case_3(self):\n result = f_799(\" \", seed=3)\n self.assertEqual(result, \"__\", \"Spaces should be replaced with underscores.\")\n def test_case_4(self):\n result = f_799(\"\\t\\n\", seed=4)\n self.assertEqual(\n result, \"_____\", \"Tab and newline should be replaced with underscores.\"\n )\n def test_case_5(self):\n result = f_799(\"a!b@c#\", seed=5)\n self.assertTrue(result.isalpha(), \"Output should only contain alphabets.\")\n self.assertEqual(\n len(result), 
len(\"abc\"), \"Length should match processed input.\"\n )\n def test_case_6(self):\n # Test with all types of whitespace characters\n result = f_799(\"a b\\tc\\nd\", seed=6)\n self.assertEqual(\n result.lower(),\n \"a_b__c___d\",\n \"Should replace all types of whitespaces correctly.\",\n )\n def test_case_7(self):\n # Test with a mix of alphanumeric and special characters\n result = f_799(\"a1! b2@ c3#\", seed=7)\n self.assertTrue(\n all(char.isalnum() or char == \"_\" for char in result),\n \"Should only contain alphanumeric characters and underscores.\",\n )\n def test_case_8(self):\n # Test with an empty string\n result = f_799(\"\", seed=8)\n self.assertEqual(result, \"\", \"Should handle empty string correctly.\")\n def test_case_9(self):\n # Test with a string that contains no special characters or whitespaces\n result = f_799(\"abcdefg\", seed=9)\n self.assertTrue(result.isalpha(), \"Should contain only letters.\")\n self.assertEqual(len(result), 7, \"Length should match the input.\")\n def test_case_10(self):\n # Test with a long string of repeated characters\n result = f_799(\"a\" * 50, seed=10)\n self.assertTrue(\n all(char.lower() == \"a\" for char in result),\n \"All characters should be 'a' or 'A'.\",\n )\n self.assertEqual(len(result), 50, \"Length should match the input.\")\n def test_case_11(self):\n # Test with only special characters\n result = f_799(\"!@#$%^&*\", seed=11)\n self.assertEqual(\n result, \"\", \"Should return an empty string for only special characters.\"\n )\n def test_case_12(self):\n # Test with numeric characters\n result = f_799(\"12345\", seed=13)\n self.assertTrue(result.isdigit(), \"Should contain only digits.\")\n self.assertEqual(len(result), 5, \"Length should match the input.\")\n def test_case_13(self):\n # Test with a string containing only whitespace characters\n result = f_799(\" \\t\\n\", seed=14)\n self.assertEqual(\n result,\n \"______\",\n \"Should replace all types of whitespaces correctly, with two 
underscores for tab and three for newline.\",\n )\n def test_case_14(self):\n # Test the randomness of uppercase conversion with a long string\n result = f_799(\"a\" * 100, seed=15)\n self.assertTrue(\n all(char.lower() == \"a\" for char in result),\n \"All characters should be 'a' or 'A'.\",\n )\n self.assertNotEqual(\n result, \"a\" * 100, \"Should have some uppercase transformations.\"\n )\n self.assertNotEqual(\n result, \"A\" * 100, \"Should have some lowercase transformations.\"\n )\n def test_case_15(self):\n # Test random seed impact\n result1 = f_799(\"test seed impact\", seed=42)\n result2 = f_799(\"test seed impact\", seed=42)\n self.assertEqual(\n result1, result2, \"Results with the same seed should be identical.\"\n )", "apis": ["string.punctuation", "random.seed", "random.choice", "re.sub", "re.escape"], "libs": ["string", "re", "random"], "doc": {"description": ["Transforms a given string by removing special characters, normalizing whitespace,", "and randomizing character casing."], "note": ["This function considers special characters to be string punctuations.", "Spaces, tabs, and newlines are replaced with with '_', '__', and '___' respectively.", "To randomize casing, this function converts characters to uppercase with a 50% probability."], "params": ["text (str): The text string to be preprocessed.", "seed (int, optional): Random seed for reproducibility. Defaults to None (not set)."], "returns": ["str: The preprocessed text string."], "reqs": ["re", "string", "random"], "raises": [], "example": [">>> f_799('Hello World!', 0)", "'HeLlo___WORlD'", ">>> f_799('attention is all you need', 42)", "'ATtENTIOn_IS_ALL_You_Need'"]}} -{"task_id": "f_883", "prompt": "import ssl\nimport os\nimport hashlib\n\n\ndef f_883(client_socket, cert_file, key_file, buffer_size=1024):\n \"\"\"\n This function secures a client socket using SSL/TLS and sends back the SHA256 hash of a file requested by the client. 
\n\n Parameters:\n - client_socket (socket.socket): The client socket that will be wrapped with SSL/TLS for secure communication.\n - cert_file (str): The file path to the SSL certificate to be used for the secure connection.\n - key_file (str): The file path to the SSL key corresponding to the certificate.\n - buffer_size (int, optional): The size of the buffer used to receive data from the client. Defaults to 1024 bytes.\n\n Returns:\n - str: The SHA256 hash of the requested file. If the requested file does not exist, returns 'File not found'. \n In case of an exception during processing, an error message is returned.\n\n Requirements:\n - ssl\n - os\n - hashlib\n\n Note:\n - This function assumes that the client requests a file by sending its path.\n - The function does not handle the opening or closing of the client_socket itself.\n - Error handling is basic and might need to be expanded based on specific use cases.\n \n Example:\n >>> # Server setup\n >>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n >>> server_socket.bind(('localhost', 443))\n >>> server_socket.listen(5)\n >>> cert_file = \"path/to/certificate.crt\"\n >>> key_file = \"path/to/private.key\"\n >>> # Accept client connection\n >>> client_socket, addr = server_socket.accept()\n >>> # Use f_883 function to handle the client request\n >>> file_hash = f_883(client_socket, cert_file, key_file)\n >>> print(\"Sent file hash:\", file_hash)\n >>> server_socket.close()\n \"\"\"", "canonical_solution": " context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)\n context.load_cert_chain(certfile=cert_file, keyfile=key_file)\n secure_socket = None\n try:\n secure_socket = context.wrap_socket(client_socket, server_side=True)\n request = secure_socket.recv(buffer_size).decode(\"utf-8\")\n\n if os.path.exists(request):\n with open(request, \"rb\") as file:\n sha256_hash = hashlib.sha256()\n for byte_block in iter(lambda: file.read(4096), b\"\"):\n sha256_hash.update(byte_block)\n response = 
sha256_hash.hexdigest()\n else:\n response = \"File not found\"\n\n secure_socket.send(response.encode(\"utf-8\"))\n except Exception as e:\n response = f\"Error: {str(e)}\"\n finally:\n if secure_socket:\n secure_socket.close()\n\n return response", "test": "import unittest\nfrom unittest.mock import MagicMock, patch\nimport ssl\nimport os\nimport hashlib\nclass TestCases(unittest.TestCase):\n \"\"\"Unit tests for f_883.\"\"\"\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_file_found(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function returns the correct SHA256 hash when the file exists.\"\"\"\n # Mocking the certificate and key file paths\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking the SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request and response\n mock_request = \"path/to/requested_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n # Mock file existence and content for hashing\n with patch(\"os.path.exists\") as mock_exists:\n mock_exists.return_value = True\n with patch(\n \"builtins.open\", unittest.mock.mock_open(read_data=b\"file content\")\n ) as mock_file:\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Check if file was opened\n mock_file.assert_called_with(mock_request, \"rb\")\n # Create expected hash\n expected_hash = hashlib.sha256(b\"file content\").hexdigest()\n # Assertions\n self.assertEqual(result, expected_hash)\n mock_context.wrap_socket.assert_called_with(\n mock_socket, server_side=True\n )\n mock_secure_socket.send.assert_called()\n mock_secure_socket.close.assert_called()\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_file_not_found(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function 
returns 'File not found' if the requested file does not exist.\"\"\"\n # Mocking the certificate and key file paths\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking the SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request\n mock_request = \"path/to/nonexistent_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n # Mock file existence\n with patch(\"os.path.exists\") as mock_exists:\n mock_exists.return_value = False\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Assertions\n self.assertEqual(result, \"File not found\")\n mock_context.wrap_socket.assert_called_with(mock_socket, server_side=True)\n mock_secure_socket.send.assert_called_with(\n \"File not found\".encode(\"utf-8\")\n )\n mock_secure_socket.close.assert_called()\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_exception_handling(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function handles exceptions properly.\"\"\"\n # Mocking the certificate and key file paths\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking the SSL context and setting up to raise an exception\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Configuring the secure_socket to raise an exception when recv is called\n mock_secure_socket.recv.side_effect = Exception(\"Test exception\")\n # Call the function and verify that it handles the exception\n result = f_883(mock_socket, cert_file, key_file)\n # Assertions\n self.assertTrue(\"Error: Test exception\" in result)\n mock_context.wrap_socket.assert_called_with(mock_socket, server_side=True)\n 
mock_secure_socket.close.assert_called()\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_f_883_empty_file(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function returns the correct SHA256 hash for an empty file.\"\"\"\n # Setup for empty file scenario\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request for an empty file\n mock_request = \"path/to/empty_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n with patch(\"os.path.exists\") as mock_exists, patch(\n \"builtins.open\", unittest.mock.mock_open(read_data=b\"\")\n ) as mock_file: # Note the b'' for empty bytes\n mock_exists.return_value = True\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Expected hash for an empty file\n expected_hash = hashlib.sha256(b\"\").hexdigest() # Hash of empty bytes\n # Assertions\n self.assertEqual(result, expected_hash)\n mock_file.assert_called_with(mock_request, \"rb\")\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_f_883_large_file(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function returns the correct SHA256 hash for a large file.\"\"\"\n # Setup for large file scenario\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request for a large file\n mock_request = \"path/to/large_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n large_file_content = b\"a\" * 10**6 # 1 MB of data\n with 
patch(\"os.path.exists\") as mock_exists, patch(\n \"builtins.open\", unittest.mock.mock_open(read_data=large_file_content)\n ) as mock_file:\n mock_exists.return_value = True\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Expected hash for the large file\n expected_hash = hashlib.sha256(large_file_content).hexdigest()\n # Assertions\n self.assertEqual(result, expected_hash)\n mock_file.assert_called_with(mock_request, \"rb\")", "apis": ["os.path.exists", "os.path", "ssl.SSLContext", "ssl.PROTOCOL_TLS_SERVER", "hashlib.sha256"], "libs": ["hashlib", "os", "ssl"], "doc": {"description": ["This function secures a client socket using SSL/TLS and sends back the SHA256 hash of a file requested by the client."], "note": ["This function assumes that the client requests a file by sending its path.", "The function does not handle the opening or closing of the client_socket itself.", "Error handling is basic and might need to be expanded based on specific use cases."], "params": ["client_socket (socket.socket): The client socket that will be wrapped with SSL/TLS for secure communication.", "cert_file (str): The file path to the SSL certificate to be used for the secure connection.", "key_file (str): The file path to the SSL key corresponding to the certificate.", "buffer_size (int, optional): The size of the buffer used to receive data from the client. Defaults to 1024 bytes."], "returns": ["str: The SHA256 hash of the requested file. 
If the requested file does not exist, returns 'File not found'.", "In case of an exception during processing, an error message is returned."], "reqs": ["ssl", "os", "hashlib"], "raises": [], "example": [">>> # Server setup", ">>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)", ">>> server_socket.bind(('localhost', 443))", ">>> server_socket.listen(5)", ">>> cert_file = \"path/to/certificate.crt\"", ">>> key_file = \"path/to/private.key\"", ">>> # Accept client connection", ">>> client_socket, addr = server_socket.accept()", ">>> # Use f_883 function to handle the client request", ">>> file_hash = f_883(client_socket, cert_file, key_file)", ">>> print(\"Sent file hash:\", file_hash)", ">>> server_socket.close()"]}} -{"task_id": "f_858", "prompt": "import requests\nfrom lxml import html\nimport pandas as pd\nimport sqlite3\n\n\ndef f_858(webpage_url: str, database_name: str = \"my_database.db\") -> int:\n \"\"\"\n This function parses HTML table data from a specified URL or local file and stores it into an SQLite database.\n The function handles different scenarios for fetching, processing, and storing data.\n\n Parameters:\n - webpage_url (str): The URL of the webpage or a local file path prefixed with \"file://\".\n - database_name (str): The name of the SQLite database file where the data is to be stored. Defaults to \"my_database.db\".\n\n Returns:\n - int: The number of rows in the parsed HTML table.\n\n Raises:\n - requests.RequestException: This exception is raised if there is a network issue in accessing the URL. \n This includes scenarios like connection errors, timeouts, and HTTP errors.\n - sqlite3.DatabaseError: This exception is raised in case of issues connecting to, or writing to, the SQLite database. 
\n This includes issues like invalid database names, write permissions, or SQL execution errors.\n\n Notes:\n - The function is designed to replace the table \"my_table\" in the specified SQLite database with new data each time it is called.\n - If the HTML content does not contain a table or if the table is empty, the function will return 0, indicating no rows were parsed and stored.\n - This function relies on the 'requests', 'lxml', 'pandas', and 'sqlite3' libraries for its operations.\n\n Requirements:\n - requests\n - lxml\n - pandas\n - sqlite3\n \n Example:\n >>> num_rows = f_858(\"http://example.com/tabledata\")\n >>> print(f\"Number of rows parsed: {num_rows}\")\n Number of rows parsed: 5\n \"\"\"", "canonical_solution": " try:\n if webpage_url.startswith(\"file://\"):\n with open(webpage_url[7:], \"r\", encoding=\"utf-8\") as file:\n content = file.read()\n else:\n response = requests.get(webpage_url, timeout=5)\n response.raise_for_status()\n content = response.content\n\n tree = html.fromstring(content)\n rows = tree.xpath(\"//tr\")\n data = [\n [cell.text_content().strip() for cell in row.xpath(\".//td\")] for row in rows\n ]\n\n # Create DataFrame\n df = pd.DataFrame(data)\n if df.empty:\n return 0\n\n # Store data in database\n conn = None\n try:\n conn = sqlite3.connect(database_name)\n df.to_sql(\"my_table\", conn, if_exists=\"replace\", index=False)\n finally:\n if conn:\n conn.close()\n\n return len(df)\n\n except requests.RequestException as e:\n raise requests.RequestException(f\"Error accessing URL {webpage_url}: {e}\")\n except sqlite3.DatabaseError as e:\n raise sqlite3.DatabaseError(f\"Database error with {database_name}: {e}\")", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport requests\nimport sqlite3\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_858.\"\"\"\n @patch(\"requests.get\")\n def test_valid_webpage_url(self, mock_get):\n \"\"\"\n Test processing HTML table data from a valid 
webpage URL.\n \"\"\"\n mock_response = MagicMock()\n mock_response.content = (\n b\"
1
\"\n )\n mock_response.status_code = 200\n mock_get.return_value = mock_response\n result = f_858(\"http://example.com\")\n self.assertEqual(result, 1)\n @patch(\n \"builtins.open\",\n new_callable=unittest.mock.mock_open,\n read_data=\"
1
\",\n )\n def test_local_file_url(self, mock_file):\n \"\"\"\n Test processing HTML table data from a local file.\n \"\"\"\n result = f_858(\"file:///path/to/file.html\")\n self.assertEqual(result, 1)\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"\n Test function behavior with an invalid URL.\n \"\"\"\n mock_get.side_effect = requests.RequestException(\"mocked request exception\")\n with self.assertRaises(requests.RequestException):\n f_858(\"http://invalid-url.com\")\n @patch(\"requests.get\")\n def test_empty_table(self, mock_get):\n \"\"\"\n Test handling an HTML page with an empty table.\n \"\"\"\n mock_response = MagicMock()\n mock_response.content = b\"
\"\n mock_response.status_code = 200\n mock_get.return_value = mock_response\n result = f_858(\"http://example.com/empty\")\n self.assertEqual(result, 0)\n @patch(\"requests.get\")\n @patch(\"sqlite3.connect\")\n def test_database_error(self, mock_connect, mock_get):\n \"\"\"\n Test function behavior when encountering a database error.\n \"\"\"\n # Mock the response from requests.get\n mock_response = MagicMock()\n mock_response.content = (\n b\"
Data
\"\n )\n mock_response.status_code = 200\n mock_get.return_value = mock_response\n # Simulate a database error\n mock_connect.side_effect = sqlite3.DatabaseError(\"mocked database error\")\n # Expect a DatabaseError to be raised\n with self.assertRaises(sqlite3.DatabaseError):\n f_858(\"http://example.com\", \"faulty_database.db\")\n @classmethod\n def tearDownClass(cls):\n \"\"\"Remove the database file with retries.\"\"\"\n if os.path.exists(\"my_database.db\"):\n os.remove(\"my_database.db\")", "apis": ["lxml.html.fromstring", "requests.get", "requests.RequestException", "sqlite3.DatabaseError", "pandas.DataFrame", "sqlite3.connect"], "libs": ["sqlite3", "pandas", "requests", "lxml"], "doc": {"description": ["This function parses HTML table data from a specified URL or local file and stores it into an SQLite database.", "The function handles different scenarios for fetching, processing, and storing data.", "Notes:", "- The function is designed to replace the table \"my_table\" in the specified SQLite database with new data each time it is called.", "- If the HTML content does not contain a table or if the table is empty, the function will return 0, indicating no rows were parsed and stored.", "- This function relies on the 'requests', 'lxml', 'pandas', and 'sqlite3' libraries for its operations."], "note": [], "params": ["webpage_url (str): The URL of the webpage or a local file path prefixed with \"file://\".", "database_name (str): The name of the SQLite database file where the data is to be stored. 
Defaults to \"my_database.db\"."], "returns": ["int: The number of rows in the parsed HTML table."], "reqs": ["requests", "lxml", "pandas", "sqlite3"], "raises": ["requests.RequestException: This exception is raised if there is a network issue in accessing the URL.", "This includes scenarios like connection errors, timeouts, and HTTP errors.", "sqlite3.DatabaseError: This exception is raised in case of issues connecting to, or writing to, the SQLite database.", "This includes issues like invalid database names, write permissions, or SQL execution errors."], "example": [">>> num_rows = f_858(\"http://example.com/tabledata\")", ">>> print(f\"Number of rows parsed: {num_rows}\")", "Number of rows parsed: 5"]}} -{"task_id": "f_356", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndef f_356(n_components=2, N_SAMPLES=500, N_FEATURES=50, random_seed=None):\n \"\"\"\n Generate a high-dimensional dataset, run PCA to reduce its dimensionality, and then draw a heatmap of\n the covariance matrix of the transformed data.\n\n Parameters:\n n_components (int, optional): The number of components for PCA. Defaults to 2.\n N_SAMPLES (int, optional): Number of samples in the dataset. Defaults to 500.\n N_FEATURES (int, optional): Number of features in the dataset. Defaults to 50.\n random_seed (int, optional): Seed for the numpy and sklearn random number generator. 
Defaults to None.\n\n Returns:\n tuple:\n transformed_data (ndarray): The transformed data of shape (N_SAMPLES, n_components).\n heatmap_axes (Axes): The heatmap of the covariance matrix of the transformed data or None if n_components=1.\n\n Requirements:\n - numpy\n - sklearn.decomposition.PCA\n - matplotlib.pyplot\n - seaborn\n\n Example:\n >>> transformed, ax = f_356(n_components=2, random_seed=42)\n >>> transformed.shape\n (500, 2)\n \"\"\"", "canonical_solution": " np.random.seed(random_seed) # Ensuring reproducibility\n X = np.random.rand(N_SAMPLES, N_FEATURES)\n\n pca = PCA(n_components=n_components, random_state=random_seed)\n X_transformed = pca.fit_transform(X)\n\n if n_components == 1:\n return X_transformed, None\n\n fig, ax = plt.subplots(figsize=(10, 7))\n sns.heatmap(np.cov(X_transformed.T), annot=True, fmt=\".2f\", ax=ax)\n\n return X_transformed, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.decomposition import PCA\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.seed = 42\n # default parameters\n self.n_components = 2\n self.N_SAMPLES = 500\n self.N_FEATURES = 50\n def test_case_1(self):\n # Test basic functionality - results\n transformed_data, _ = f_356()\n self.assertEqual(transformed_data.shape, (self.N_SAMPLES, self.n_components))\n np.random.seed(self.seed)\n X = np.random.rand(self.N_SAMPLES, self.N_FEATURES)\n pca = PCA(n_components=self.n_components, random_state=self.seed)\n pca.fit(X)\n self.assertTrue(np.sum(pca.explained_variance_ratio_) <= 1)\n def test_case_2(self):\n # Test basic functionality - visualization\n _, heatmap_axes = f_356()\n self.assertIsNotNone(heatmap_axes)\n self.assertIsInstance(heatmap_axes, plt.Axes)\n self.assertEqual(len(heatmap_axes.get_xticklabels()), 2)\n self.assertEqual(len(heatmap_axes.get_yticklabels()), 2)\n def test_case_3(self):\n # Test n_components\n for n_components in [1, 10, self.N_FEATURES]:\n transformed_data, _ = f_356(\n 
n_components=n_components, N_FEATURES=self.N_FEATURES\n )\n self.assertEqual(transformed_data.shape, (self.N_SAMPLES, n_components))\n def test_case_4(self):\n # Test N_SAMPLES\n for n_samples in [self.n_components, 10, 50, 100]:\n transformed_data, _ = f_356(N_SAMPLES=n_samples)\n self.assertEqual(transformed_data.shape, (n_samples, self.n_components))\n def test_case_5(self):\n # Test N_FEATURES\n for n_features in [self.n_components, 10, 50, 100]:\n transformed_data, _ = f_356(N_FEATURES=n_features)\n self.assertEqual(\n transformed_data.shape, (self.N_SAMPLES, self.n_components)\n )\n def test_case_6(self):\n # Test random_seed\n transformed_data1, _ = f_356(random_seed=self.seed)\n transformed_data2, _ = f_356(random_seed=self.seed)\n np.testing.assert_array_equal(transformed_data1, transformed_data2)\n transformed_data2, _ = f_356(random_seed=0)\n with self.assertRaises(AssertionError):\n np.testing.assert_array_equal(transformed_data1, transformed_data2)\n def test_case_7(self):\n # Function should fail at invalid values\n with self.assertRaises(ValueError):\n # negative n_components\n f_356(n_components=-1)\n with self.assertRaises(ValueError):\n # more components than features\n f_356(n_components=self.N_FEATURES + 10, N_FEATURES=self.N_FEATURES)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.cov", "numpy.random", "numpy.random.seed", "seaborn.heatmap", "numpy.random.rand", "sklearn.decomposition.PCA", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "sklearn", "seaborn"], "doc": {"description": ["Generate a high-dimensional dataset, run PCA to reduce its dimensionality, and then draw a heatmap of", "the covariance matrix of the transformed data."], "note": [], "params": ["n_components (int, optional): The number of components for PCA. Defaults to 2.", "N_SAMPLES (int, optional): Number of samples in the dataset. Defaults to 500.", "N_FEATURES (int, optional): Number of features in the dataset. 
Defaults to 50.", "random_seed (int, optional): Seed for the numpy and sklearn random number generator. Defaults to None."], "returns": ["tuple:", "transformed_data (ndarray): The transformed data of shape (N_SAMPLES, n_components).", "heatmap_axes (Axes): The heatmap of the covariance matrix of the transformed data or None if n_components=1."], "reqs": ["numpy", "sklearn.decomposition.PCA", "matplotlib.pyplot", "seaborn"], "raises": [], "example": [">>> transformed, ax = f_356(n_components=2, random_seed=42)", ">>> transformed.shape", "(500, 2)"]}} -{"task_id": "f_893", "prompt": "import re\nimport pandas as pd\n\n\ndef f_893(input_string: str) -> pd.DataFrame:\n \"\"\"\n Process a multi-line string by replacing tabs with spaces and converting it into a pandas DataFrame.\n Each non-empty line of the input string is transformed into a separate row in the DataFrame.\n The function specifically filters out empty lines and replaces tabs with single spaces in the remaining lines.\n\n Parameters:\n - input_string (str): A multi-line string. Each line is separated by a newline character ('\\\\n').\n\n Returns:\n - pd.DataFrame: A DataFrame with a single column named 'Text'. Each row in this column corresponds to a non-empty\n line from the input string, with tabs replaced by spaces.\n\n Requirements:\n - re\n - pandas\n\n Note:\n - The function excludes lines that are empty or contain only whitespace.\n - Tabs within the lines are replaced with a single space. 
For instance, a '\\\\t' character in the input string\n will be replaced by ' ' in the output DataFrame.\n\n Example:\n >>> df = f_893('line a\\\\nfollowed by line b with a\\\\ttab\\\\n\\\\n...bye\\\\n')\n >>> print(df.head())\n Text\n 0 line a\n 1 followed by line b with a tab\n 2 ...bye\n \"\"\"", "canonical_solution": " input_string = input_string.replace('\\\\n', '\\n').replace('\\\\t', ' ')\n # Split the input string into lines and filter out empty lines\n lines = [line for line in input_string.split(\"\\n\") if line.strip()]\n # Replace tabs with spaces in each line\n lines = [re.sub(\"\\t\", \" \", line) for line in lines]\n # Create a DataFrame from the processed lines\n return pd.DataFrame(lines, columns=[\"Text\"])", "test": "import pandas as pd\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_893.\"\"\"\n def test_basic_string(self):\n \"\"\"\n Test with a basic multi-line string.\n \"\"\"\n input_str = \"line1\\nline2 with a\\ttab\\nline3\"\n expected_output = pd.DataFrame({\"Text\": [\"line1\", \"line2 with a tab\", \"line3\"]})\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_empty_string(self):\n \"\"\"\n Test with an empty string.\n \"\"\"\n input_str = \"\"\n expected_output = pd.DataFrame(columns=[\"Text\"])\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_string_with_empty_lines(self):\n \"\"\"\n Test with a string that contains empty lines.\n \"\"\"\n input_str = \"line1\\n\\nline3\"\n expected_output = pd.DataFrame({\"Text\": [\"line1\", \"line3\"]})\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_string_with_only_tabs(self):\n \"\"\"\n Test with a string that contains only tabs.\n \"\"\"\n input_str = \"\\t\\t\\t\"\n expected_output = pd.DataFrame(columns=[\"Text\"])\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_string_with_mixed_whitespace(self):\n \"\"\"\n Test with a string that contains a mix of 
tabs and spaces.\n \"\"\"\n input_str = \"line1\\n \\t \\nline3\"\n expected_output = pd.DataFrame({\"Text\": [\"line1\", \"line3\"]})\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)", "apis": ["pandas.DataFrame", "re.sub"], "libs": ["pandas", "re"], "doc": {"description": ["Process a multi-line string by replacing tabs with spaces and converting it into a pandas DataFrame.", "Each non-empty line of the input string is transformed into a separate row in the DataFrame.", "The function specifically filters out empty lines and replaces tabs with single spaces in the remaining lines."], "note": ["The function excludes lines that are empty or contain only whitespace.", "Tabs within the lines are replaced with a single space. For instance, a '\\\\t' character in the input string", "will be replaced by ' ' in the output DataFrame."], "params": ["input_string (str): A multi-line string. Each line is separated by a newline character ('\\\\n')."], "returns": ["pd.DataFrame: A DataFrame with a single column named 'Text'. Each row in this column corresponds to a non-empty", "line from the input string, with tabs replaced by spaces."], "reqs": ["re", "pandas"], "raises": [], "example": [">>> df = f_893('line a\\\\nfollowed by line b with a\\\\ttab\\\\n\\\\n...bye\\\\n')", ">>> print(df.head())", "Text", "0 line a", "1 followed by line b with a tab", "2 ...bye"]}} -{"task_id": "f_816", "prompt": "import os\nimport shutil\n\n\ndef f_816(path, delimiter=\"/\"):\n \"\"\"\n Splits a given file path by a specific delimiter and computes disk usage for each directory component.\n\n Parameters:\n - path (str): The file path to split.\n - delimiter (str, optional): The delimiter to use for splitting the path. 
Default is '/'.\n\n Returns:\n list: A list of tuples where each tuple contains a path component and its disk usage as a dictionary.\n The disk usage dictionary contains keys 'total', 'used', and 'free'.\n\n Raises:\n - ValueError: If the 'path' is empty, not a string, or contain invalid components.\n - FileNotFoundError: If the 'path' does not exist in the filesystem.\n\n Requirements:\n - os\n - shutil\n\n Examples:\n >>> f_816('Docs/src', '/')\n [('Docs', {'total': 100, 'used': 50, 'free': 50}), ('src', {'total': 200, 'used': 100, 'free': 100})]\n\n >>> f_816('a/b', '/')\n [('a', {'total': 300, 'used': 150, 'free': 150}), ('b', {'total': 400, 'used': 200, 'free': 200})]\n \"\"\"", "canonical_solution": " if not path or not isinstance(path, str):\n raise ValueError(\"Path must be a non-empty string\")\n if not os.path.exists(path):\n raise FileNotFoundError(f\"Path '{path}' does not exist\")\n\n path_components = path.strip(delimiter).split(delimiter)\n if not all(path_components):\n raise ValueError(\"Path contains invalid components\")\n\n results = []\n for index, component in enumerate(path_components):\n sub_path = delimiter.join(path_components[: index + 1])\n if not sub_path.startswith(delimiter):\n sub_path = delimiter + sub_path\n usage = shutil.disk_usage(sub_path)\n results.append(\n (component, {\"total\": usage.total, \"used\": usage.used, \"free\": usage.free})\n )\n\n return results", "test": "import unittest\nfrom collections import namedtuple\nfrom unittest.mock import patch\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n DiskUsage = namedtuple(\"DiskUsage\", [\"total\", \"used\", \"free\"])\n # Setup realistic disk usage values for different directories\n self.mock_usage_root = DiskUsage(500000000000, 300000000000, 200000000000)\n self.mock_usage_docs = DiskUsage(100000000000, 50000000000, 50000000000)\n self.mock_usage_src = DiskUsage(50000000000, 25000000000, 25000000000)\n self.mock_usage_home = 
DiskUsage(200000000000, 100000000000, 100000000000)\n def disk_usage_side_effect(self, path):\n # Helper for mocking\n if path.endswith(\"src\"):\n return self.mock_usage_src\n elif path.endswith(\"Docs\"):\n return self.mock_usage_docs\n elif path == \"/home\":\n return self.mock_usage_home\n return self.mock_usage_root\n @patch(\"os.path.exists\")\n def test_nonexist_path(self, mock_exists):\n # Test function should raise error if path does not exist\n mock_exists.return_value = True\n with tempfile.TemporaryDirectory() as tmpdirname:\n non_exist_path = os.path.join(tmpdirname, \"nonexist\")\n with self.assertRaises(FileNotFoundError):\n f_816(non_exist_path)\n def test_invalid_path(self):\n # Test function should raise error if path is not valid\n with self.assertRaises(ValueError):\n f_816(\"\")\n with self.assertRaises(ValueError):\n f_816(123)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_varied_path(self, mock_disk_usage, mock_exists):\n # Test functionality\n mock_exists.return_value = True\n mock_disk_usage.side_effect = self.disk_usage_side_effect\n result = f_816(\"Docs/src\")\n expected = [\n (\n \"Docs\",\n {\n \"total\": self.mock_usage_docs.total,\n \"used\": self.mock_usage_docs.used,\n \"free\": self.mock_usage_docs.free,\n },\n ),\n (\n \"src\",\n {\n \"total\": self.mock_usage_src.total,\n \"used\": self.mock_usage_src.used,\n \"free\": self.mock_usage_src.free,\n },\n ),\n ]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_deep_nested_path(self, mock_disk_usage, mock_exists):\n # Test nested paths\n mock_exists.return_value = True\n mock_disk_usage.return_value = self.mock_usage_src\n deep_path = \"Docs/src/Projects/Python/Example\"\n result = f_816(deep_path)\n expected = [\n (\"Docs\", self.mock_usage_src._asdict()),\n (\"src\", self.mock_usage_src._asdict()),\n (\"Projects\", self.mock_usage_src._asdict()),\n (\"Python\", self.mock_usage_src._asdict()),\n 
(\"Example\", self.mock_usage_src._asdict()),\n ]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_single_directory(self, mock_disk_usage, mock_exists):\n # Test function works on single directory\n mock_exists.return_value = True\n mock_disk_usage.return_value = self.mock_usage_home\n result = f_816(\"home\")\n expected = [(\"home\", self.mock_usage_home._asdict())]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_path_with_multiple_delimiters(self, mock_disk_usage, mock_exists):\n # Test should fail if there is an invalid path component\n mock_exists.return_value = True\n mock_disk_usage.side_effect = lambda path: {\n \"/Docs\": self.mock_usage_docs,\n \"/Docs/src\": self.mock_usage_src,\n }.get(path, self.mock_usage_root)\n with self.assertRaises(ValueError):\n result = f_816(\"Docs//src\")\n expected = [\n (\"Docs\", self.mock_usage_docs._asdict()),\n (\"\", {\"total\": 0, \"used\": 0, \"free\": 0}),\n (\"src\", self.mock_usage_src._asdict()),\n ]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_path_with_trailing_delimiter(self, mock_disk_usage, mock_exists):\n # Test should handle trailing delimiter\n mock_exists.return_value = True\n mock_disk_usage.side_effect = lambda path: {\n \"/Docs\": self.mock_usage_docs,\n \"/Docs/src\": self.mock_usage_src,\n }.get(path, self.mock_usage_root)\n result = f_816(\"Docs/src/\")\n expected = [\n (\"Docs\", self.mock_usage_docs._asdict()),\n (\"src\", self.mock_usage_src._asdict()),\n ]\n self.assertEqual(result, expected)", "apis": ["os.path.exists", "shutil.disk_usage", "os.path"], "libs": ["shutil", "os"], "doc": {"description": ["Splits a given file path by a specific delimiter and computes disk usage for each directory component.", ">>> f_816('a/b', '/')", "[('a', {'total': 300, 'used': 150, 'free': 150}), ('b', {'total': 400, 'used': 200, 
'free': 200})]"], "note": [], "params": ["path (str): The file path to split.", "delimiter (str, optional): The delimiter to use for splitting the path. Default is '/'."], "returns": ["list: A list of tuples where each tuple contains a path component and its disk usage as a dictionary.", "The disk usage dictionary contains keys 'total', 'used', and 'free'."], "reqs": ["os", "shutil"], "raises": ["ValueError: If the 'path' is empty, not a string, or contain invalid components.", "FileNotFoundError: If the 'path' does not exist in the filesystem."], "example": ["Examples:", ">>> f_816('Docs/src', '/')", "[('Docs', {'total': 100, 'used': 50, 'free': 50}), ('src', {'total': 200, 'used': 100, 'free': 100})]"]}} -{"task_id": "f_347", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\n\n\ndef f_347(P, T, tensor_shape=(3, 3, 3)):\n \"\"\"\n Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then apply PCA to reduce the\n dimensionality of the result. The resulting 2D data is then visualized.\n Note: This function only accepts numpy matrices/arrays.\n\n Parameters:\n P (numpy.ndarray): The input matrix.\n T (numpy.ndarray): The input tensor. Must have same shape as tensor_shape.\n tensor_shape (tuple, optional): The shape of the tensor. Must be same as T.shape. 
Default is (3, 3, 3).\n\n Returns:\n pca_result (numpy.ndarray): The result of PCA of shape (N, 2), where N is the number of rows in matrix P.\n ax (matplotlib.axes.Axes): Plot of 'PCA Result Visualization', with 'Principal Component 1' on the x-axis\n and 'Principal Component 2' on the y-axis.\n\n\n\n Requirements:\n - numpy\n - sklearn.decomposition\n - matplotlib.pyplot\n\n Example:\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])\n >>> T = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])\n >>> pca_result, ax = f_347(P, T)\n >>> pca_result.shape\n (3, 2)\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if not (isinstance(P, np.ndarray) and isinstance(T, np.ndarray)):\n raise TypeError(\"Expected inputs to be numpy arrays\")\n\n if not T.shape == tensor_shape:\n raise ValueError(\"Provided tensor does not match the specified tensor_shape.\")\n\n result = np.tensordot(P, T, axes=[1, 1]).swapaxes(0, 1)\n\n # Reshape the result for PCA\n result = result.reshape(result.shape[0], -1)\n pca = PCA(n_components=2)\n pca_result = pca.fit_transform(result)\n\n fig, ax = plt.subplots()\n ax.scatter(pca_result[:, 0], pca_result[:, 1])\n ax.set_title(\"PCA Result Visualization\")\n ax.set_xlabel(\"Principal Component 1\")\n ax.set_ylabel(\"Principal Component 2\")\n\n return pca_result, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n # Set up common matrices and tensors for testing\n self.TENSOR_SHAPE = (3, 3, 3)\n self.P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1]])\n self.T = np.random.rand(*self.TENSOR_SHAPE)\n self.T_zeros = np.zeros(self.TENSOR_SHAPE)\n self.T_ones = np.ones(self.TENSOR_SHAPE)\n def test_case_1(self):\n # Test results and plot correctness\n pca_result, ax = f_347(self.P, self.T)\n self._common_assertions(pca_result, ax)\n def test_case_2(self):\n # Function should fail when input 
types are invalid\n with self.assertRaises(Exception):\n f_347(\"not a numpy array\", self.T, self.TENSOR_SHAPE)\n with self.assertRaises(Exception):\n f_347(self.P, \"not a numpy array\", self.TENSOR_SHAPE)\n with self.assertRaises(Exception):\n f_347([], [], self.TENSOR_SHAPE)\n def test_case_3(self):\n # Function should fail when input shapes are invalid\n T_incorrect_shape = np.random.rand(2, 2, 2)\n with self.assertRaises(Exception):\n f_347(self.P, T_incorrect_shape, self.TENSOR_SHAPE)\n with self.assertRaises(Exception):\n f_347(np.array([]), np.array([]), self.TENSOR_SHAPE)\n def test_case_4(self):\n # Test custom shapes\n P = np.random.rand(5, 4)\n T = np.random.rand(5, 4, 4)\n pca_result, ax = f_347(P, T, tensor_shape=T.shape)\n self._common_assertions(pca_result, ax)\n def test_case_5(self):\n # Test with zeros\n pca_result, ax = f_347(self.P, self.T_zeros)\n self._common_assertions(pca_result, ax)\n def test_case_6(self):\n # Adjusting the matrix and tensor to have a slight variation\n P = np.array([[1.01, 0.01, 0.01], [0.01, 1.01, 0.01], [0.01, 0.01, 1.01]])\n T = np.ones(self.TENSOR_SHAPE) + 0.01 * np.random.rand(*self.TENSOR_SHAPE)\n pca_result, ax = f_347(P, T)\n # Assert that the PCA results don't produce NaN values and that there's a reduction in dimensionality\n self.assertFalse(np.isnan(pca_result).any())\n self.assertEqual(pca_result.shape[1], 2)\n # Also check common assertions\n self._common_assertions(pca_result, ax)\n def _common_assertions(self, pca_result, ax):\n # Common assertions for shape and plot labels\n self.assertEqual(pca_result.shape[1], 2)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"PCA Result Visualization\")\n self.assertEqual(ax.get_xlabel(), \"Principal Component 1\")\n self.assertEqual(ax.get_ylabel(), \"Principal Component 2\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.ndarray", "numpy.tensordot", "matplotlib.pyplot.subplots", "sklearn.decomposition.PCA"], "libs": 
["numpy", "matplotlib", "sklearn"], "doc": {"description": ["Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then apply PCA to reduce the", "dimensionality of the result. The resulting 2D data is then visualized."], "note": ["This function only accepts numpy matrices/arrays."], "params": ["P (numpy.ndarray): The input matrix.", "T (numpy.ndarray): The input tensor. Must have same shape as tensor_shape.", "tensor_shape (tuple, optional): The shape of the tensor. Must be same as T.shape. Default is (3, 3, 3)."], "returns": ["pca_result (numpy.ndarray): The result of PCA of shape (N, 2), where N is the number of rows in matrix P.", "ax (matplotlib.axes.Axes): Plot of 'PCA Result Visualization', with 'Principal Component 1' on the x-axis", "and 'Principal Component 2' on the y-axis."], "reqs": ["numpy", "sklearn.decomposition", "matplotlib.pyplot"], "raises": [], "example": [">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])", ">>> T = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])", ">>> pca_result, ax = f_347(P, T)", ">>> pca_result.shape", "(3, 2)", ">>> type(ax)", ""]}} -{"task_id": "f_398", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_398(column, data):\n \"\"\"\n Analyzes a list of stock data and calculates the sum, mean, minimum, and maximum\n values for a specified column.\n\n Parameters:\n - column (str): The name of the column to analyze. 
Valid options are 'Date', 'Open', 'High',\n 'Low', 'Close', and 'Volume'.\n - data (list of lists): A list where each element is a list representing stock data for a single day.\n Each inner list should contain values in the following order:\n 'Date', 'Open', 'High', 'Low', 'Close', 'Volume'.\n Function will raise ValueError if the structure is not as expected.\n Returns:\n - dict: A dictionary containing the calculated 'sum', 'mean', 'min' (minimum), and 'max' (maximum)\n for the specified column. If the input data is empty, 'sum' will be 0, and 'mean', 'min', and\n 'max' will be NaN.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]\n >>> results = f_398('Open', data)\n >>> results\n {'sum': 100, 'mean': 100.0, 'min': 100, 'max': 100}\n >>> type(results)\n \n \"\"\"", "canonical_solution": " valid_columns = [\"Date\", \"Open\", \"High\", \"Low\", \"Close\", \"Volume\"]\n if column not in valid_columns:\n raise ValueError(f\"Invalid column name.\")\n if not isinstance(data, list) or (\n len(data) > 0\n and not all(\n isinstance(row, list) and len(row) == len(valid_columns) for row in data\n )\n ):\n raise ValueError(\n \"Data must be a list of lists, with each inner list matching the length of the column names.\"\n )\n\n df = pd.DataFrame(data, columns=valid_columns)\n column_data = df[column]\n\n result = {\n \"sum\": np.sum(column_data) if not column_data.empty else 0,\n \"mean\": np.mean(column_data) if not column_data.empty else float(\"nan\"),\n \"min\": np.min(column_data) if not column_data.empty else float(\"nan\"),\n \"max\": np.max(column_data) if not column_data.empty else float(\"nan\"),\n }\n\n return result", "test": "import unittest\nimport numpy as np\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n def assertDictAlmostEqual(self, d1, d2, msg=None):\n # Helper function for testing\n for k, v in d1.items():\n if isinstance(v, float) and np.isnan(v):\n 
self.assertTrue(np.isnan(d2[k]), msg or f\"{k} not almost equal\")\n else:\n self.assertAlmostEqual(v, d2[k], msg=msg or f\"{k} not equal\")\n def test_case_1(self):\n # Test with valid data for a specific column\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), 102, 108, 100, 105, 15000],\n [datetime(2022, 1, 3), 105, 110, 103, 108, 20000],\n ]\n result = f_398(\"Open\", data)\n expected_result = {\n \"sum\": 307,\n \"mean\": 102.33333333333333,\n \"min\": 100,\n \"max\": 105,\n }\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_2(self):\n # Test with empty data list\n data = []\n result = f_398(\"Open\", data)\n expected_result = {\n \"sum\": 0,\n \"mean\": float(\"nan\"),\n \"min\": float(\"nan\"),\n \"max\": float(\"nan\"),\n }\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_3(self):\n # Test with an invalid column name\n data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]\n with self.assertRaises(ValueError):\n f_398(\"InvalidColumn\", data)\n def test_case_4(self):\n # Test with NaN values in the target column\n data = [\n [datetime(2022, 1, 1), np.nan, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), 102, np.nan, 100, 105, 15000],\n [datetime(2022, 1, 3), 105, np.nan, 103, 108, 20000],\n ]\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 207, \"mean\": 103.5, \"min\": 102, \"max\": 105}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_5(self):\n # Test with all values in the target column being the same\n data = [[datetime(2022, 1, 1), 100, 100, 100, 100, 10000]] * 3\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 300, \"mean\": 100, \"min\": 100, \"max\": 100}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_6(self):\n # Test for handling mixed data types within a single column\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), \"102\", 108, 100, 105, 15000],\n ]\n 
with self.assertRaises(TypeError):\n f_398(\"Open\", data)\n def test_case_7(self):\n # Test with extremely large values in the target column\n data = [[datetime(2022, 1, 1), 1e18, 1.05e18, 0.95e18, 1.02e18, 10000]]\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 1e18, \"mean\": 1e18, \"min\": 1e18, \"max\": 1e18}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_8(self):\n # Test with a single row of data\n data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 100, \"mean\": 100, \"min\": 100, \"max\": 100}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_9(self):\n # Test with a very large dataset to check performance/scalability\n large_data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]] * 10000\n result = f_398(\"Open\", large_data)\n expected_result = {\"sum\": 1000000, \"mean\": 100, \"min\": 100, \"max\": 100}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_10(self):\n # Test for column case sensitivity\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n ]\n with self.assertRaises(ValueError):\n f_398(\"open\", data)\n def test_case_11(self):\n # Test with incorrect data\n data = \"Incorrect data type\"\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)\n def test_case_12(self):\n # Test for data list containing lists of varying lengths\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), 102, 108, 100],\n ]\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)\n def test_case_13(self):\n # Test for data list containing elements other than lists (mixed types)\n data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000], \"Not a list\"]\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)\n def test_case_14(self):\n # Test for a correctly structured and typed data list but with an empty inner list\n data = [[datetime(2022, 1, 1), 100, 
105, 95, 102, 10000], []]\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)", "apis": ["numpy.mean", "numpy.min", "numpy.max", "numpy.sum", "pandas.DataFrame"], "libs": ["numpy", "pandas"], "doc": {"description": ["Analyzes a list of stock data and calculates the sum, mean, minimum, and maximum", "values for a specified column."], "note": [], "params": ["column (str): The name of the column to analyze. Valid options are 'Date', 'Open', 'High',", "'Low', 'Close', and 'Volume'.", "data (list of lists): A list where each element is a list representing stock data for a single day.", "Each inner list should contain values in the following order:", "'Date', 'Open', 'High', 'Low', 'Close', 'Volume'.", "Function will raise ValueError if the structure is not as expected."], "returns": ["dict: A dictionary containing the calculated 'sum', 'mean', 'min' (minimum), and 'max' (maximum)", "for the specified column. If the input data is empty, 'sum' will be 0, and 'mean', 'min', and", "'max' will be NaN."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]", ">>> results = f_398('Open', data)", ">>> results", "{'sum': 100, 'mean': 100.0, 'min': 100, 'max': 100}", ">>> type(results)", ""]}} -{"task_id": "f_839", "prompt": "import requests\nimport json\nfrom bs4 import BeautifulSoup\n\n\ndef f_839(url: str, file_name: str = \"Output.txt\") -> str:\n \"\"\"\n Scrape the title from a specified web page, save it in JSON format to a given file, \n and append to the file if it exists.\n\n Parameters:\n - url (str): The URL of the web page from which the title is to be scraped.\n - file_name (str, optional): The name of the file to save the scraped title. \n If the file already exists, the new data is appended. 
Defaults to 'Output.txt'.\n\n Returns:\n - str: The file path where the scraped title is saved.\n\n Requirements:\n - requests\n - json\n - bs4\n\n Notes:\n - If the web page does not have a title, 'None' is saved as the title value in the JSON data.\n - Data is appended to the specified file in JSON format, with each title on a new line.\n\n Example:\n >>> f_839(\"http://example.com\")\n 'Output.txt'\n >>> f_839(\"http://another-example.com\", \"AnotherOutput.txt\")\n 'AnotherOutput.txt'\n \"\"\"", "canonical_solution": " response = requests.get(url, timeout=5)\n soup = BeautifulSoup(response.text, \"html.parser\")\n title = soup.title.string if soup.title else None\n data = {\"title\": title}\n json_data = json.dumps(data)\n with open(file_name, \"a\", encoding=\"utf-8\") as f:\n f.write(json_data + \"\\n\")\n return file_name", "test": "import unittest\nfrom unittest.mock import patch, mock_open\nimport requests\nimport json\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_839\"\"\"\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_scrape_title_page_1(self, mock_file):\n \"\"\"Test that the title is scraped from a web page and saved to a file\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"Test Page 1\"\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": \"Test Page 1\"}) + \"\\n\"\n )\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_scrape_title_page_2(self, mock_file):\n \"\"\"Test that the title is scraped from a web page and saved to a file\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"Test Page 2\"\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\", 
\"AnotherOutput.txt\")\n self.assertEqual(file_path, \"AnotherOutput.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": \"Test Page 2\"}) + \"\\n\"\n )\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_invalid_url(self, mock_file):\n \"\"\"Test that an exception is raised when the URL is invalid\"\"\"\n with self.assertRaises(requests.RequestException):\n f_839(\"http://invalid-url\")\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_page_without_title(self, mock_file):\n \"\"\"Test that 'None' is saved as the title when the web page does not have a title\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"\"\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": None}) + \"\\n\"\n )\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_very_long_title(self, mock_file):\n \"\"\"Test that a very long title is saved correctly\"\"\"\n long_title = \"A\" * 1024 # A very long title of 1024 characters\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = f\"{long_title}\".encode()\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": long_title}) + \"\\n\"\n )\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=json.dumps({\"title\": \"Existing Title\"}) + \"\\n\",\n )\n def test_append_to_existing_file(self, mock_file):\n \"\"\"Test that data is appended to an existing file\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"New Title\"\n with patch(\"requests.get\", 
return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_with(\n json.dumps({\"title\": \"New Title\"}) + \"\\n\"\n )", "apis": ["bs4.BeautifulSoup", "json.dumps", "requests.get"], "libs": ["bs4", "json", "requests"], "doc": {"description": ["Scrape the title from a specified web page, save it in JSON format to a given file,", "and append to the file if it exists.", "Notes:", "- If the web page does not have a title, 'None' is saved as the title value in the JSON data.", "- Data is appended to the specified file in JSON format, with each title on a new line."], "note": [], "params": ["url (str): The URL of the web page from which the title is to be scraped.", "file_name (str, optional): The name of the file to save the scraped title.", "If the file already exists, the new data is appended. Defaults to 'Output.txt'."], "returns": ["str: The file path where the scraped title is saved."], "reqs": ["requests", "json", "bs4"], "raises": [], "example": [">>> f_839(\"http://example.com\")", "'Output.txt'", ">>> f_839(\"http://another-example.com\", \"AnotherOutput.txt\")", "'AnotherOutput.txt'"]}} -{"task_id": "f_376", "prompt": "import pandas as pd\nimport re\nimport random\n\n\ndef f_376(data_list, seed=None):\n \"\"\"\n Removes a random comma-separated value (treated as a \"substring\") from each string\n in a list and returns a pandas DataFrame containing the original and modified strings.\n\n Parameters:\n - data_list (list of str): A list of comma-separated strings. 
The function will remove\n leading and trailing whitespaces first before processing.\n - seed (int, optional): Seed for the random number generator for reproducibility.\n Default is None, which uses system time.\n\n Returns:\n - DataFrame: A pandas DataFrame with columns 'Original String' and 'Modified String'.\n\n Requirements:\n - pandas\n - re\n - random\n\n Example:\n >>> f_376(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=42)\n Original String Modified String\n 0 lamp, bag, mirror lamp, bag\n 1 table, chair, bag, lamp chair, bag, lamp\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n df = pd.DataFrame([s.strip() for s in data_list], columns=[\"Original String\"])\n\n modified_strings = []\n for s in data_list:\n substrings = re.split(\", \", s)\n random_substring = random.choice(substrings)\n modified_s = (\n s.replace(\", \" + random_substring, \"\")\n if \", \" + random_substring in s\n else s.replace(random_substring + \", \", \"\")\n )\n modified_strings.append(modified_s)\n\n df[\"Modified String\"] = modified_strings\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.columns = [\"Original String\", \"Modified String\"]\n def test_case_1(self):\n # Test basic case\n input_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, input_data)\n def test_case_2(self):\n # Test single character\n input_data = [\"a, b, c, d, e\", \"f, g, h, i, j\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, input_data)\n def test_case_3(self):\n # Test single numeric characters\n input_data = [\"1, 2, 3\", \"4, 5, 6, 7\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, input_data)\n def test_case_4(self):\n # Test with an empty list\n input_data = []\n result = f_376(input_data, seed=42)\n self.assertTrue(result.empty)\n def test_case_5(self):\n # Test 
with strings without commas\n input_data = [\"apple\", \"car\"]\n result = f_376(input_data, seed=42)\n # Ensure dataframe has correct columns\n self.assertListEqual(list(result.columns), self.columns)\n # Ensure 'Modified String' is the same as 'Original String' for single values\n for orig, mod in zip(result[\"Original String\"], result[\"Modified String\"]):\n self.assertEqual(orig.strip(), mod)\n def test_case_6(self):\n # Test strings with leading and trailing spaces\n input_data = [\" apple, orange, banana \", \" car, bike, plane\"]\n expected_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, expected_data)\n def test_case_7(self):\n # Test strings where the same value appears multiple times\n input_data = [\"apple, apple, banana\", \"car, car, bike, plane\"]\n result = f_376(input_data, seed=42)\n # Special case where substrings might be duplicated\n for orig, mod in zip(result[\"Original String\"], result[\"Modified String\"]):\n diff = len(orig.split(\", \")) - len(mod.split(\", \"))\n self.assertTrue(diff in [0, 1]) # Either no change or one substring removed\n def test_case_8(self):\n # Test reproducibility with the same seed\n input_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result1 = f_376(input_data, seed=42)\n result2 = f_376(input_data, seed=42)\n pd.testing.assert_frame_equal(result1, result2)\n def test_case_9(self):\n # Test difference with different seeds\n input_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result1 = f_376(input_data, seed=42)\n result2 = f_376(input_data, seed=43)\n self.assertFalse(result1.equals(result2))\n def _test_dataframe(self, df, input_data):\n # Ensure dataframe has correct columns\n self.assertListEqual(list(df.columns), self.columns)\n # Ensure 'Modified String' has one less substring than 'Original String'\n for orig, mod in zip(df[\"Original String\"], df[\"Modified String\"]):\n self.assertTrue(orig in 
input_data) # Ensure original string is from input\n self.assertEqual(len(orig.split(\", \")) - 1, len(mod.split(\", \")))", "apis": ["re.split", "pandas.DataFrame", "random.seed", "random.choice"], "libs": ["pandas", "re", "random"], "doc": {"description": ["Removes a random comma-separated value (treated as a \"substring\") from each string", "in a list and returns a pandas DataFrame containing the original and modified strings."], "note": [], "params": ["data_list (list of str): A list of comma-separated strings. The function will remove", "leading and trailing whitespaces first before processing.", "seed (int, optional): Seed for the random number generator for reproducibility.", "Default is None, which uses system time."], "returns": ["DataFrame: A pandas DataFrame with columns 'Original String' and 'Modified String'."], "reqs": ["pandas", "re", "random"], "raises": [], "example": [">>> f_376(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=42)", "Original String Modified String", "0 lamp, bag, mirror lamp, bag", "1 table, chair, bag, lamp chair, bag, lamp"]}} -{"task_id": "f_855", "prompt": "import requests\nfrom pathlib import Path\nimport zipfile\n\n# Constants\nDOWNLOAD_DIR = Path(\"downloads\")\nZIP_DIR = Path(\"unzipped_files\")\n\n\ndef f_855(url, filename):\n \"\"\"\n Downloads and extracts a zip file from a specified URL.\n\n Parameters:\n url (str): The URL of the zip file to download.\n filename (str): The filename under which the downloaded zip file will be saved.\n\n Returns:\n tuple: A tuple containing a status message and a list of filenames in the unzipped directory, or an empty list if extraction fails.\n\n Note:\n the status message will contain \"Error\" when:\n - Network-related exceptions are raised if the download fails.\n - File-related exceptions are raised if there is an issue with file handling or extraction.\n\n Requirements:\n - requests\n - pathlib.Path\n - zipfile\n\n Example:\n >>> f_855('http://example.com/myfile.zip', 
'myfile.zip')\n ('Download and extraction successful', ['file1.txt', 'file2.txt'])\n \"\"\"", "canonical_solution": " try:\n # Download the file\n response = requests.get(url, stream=True, timeout=5)\n if response.status_code == 200:\n filepath = DOWNLOAD_DIR / filename\n filepath.parent.mkdir(parents=True, exist_ok=True)\n\n with open(filepath, \"wb\") as handle:\n for data in response.iter_content():\n handle.write(data)\n\n # Unzip the file\n zip_dir = ZIP_DIR / filename[:-4]\n zip_dir.mkdir(parents=True, exist_ok=True)\n\n with zipfile.ZipFile(filepath, \"r\") as zip_ref:\n zip_ref.extractall(zip_dir)\n\n return \"Download and extraction successful\", [\n file.name for file in zip_dir.iterdir()\n ]\n return (\n f\"Download failed: HTTP status code {response.status_code}\",\n [],\n )\n except requests.exceptions.RequestException as e:\n return f\"Error: {e}\", []\n except zipfile.BadZipFile as e:\n return f\"Error: Invalid zip file: {e}\", []", "test": "import unittest\nfrom unittest.mock import MagicMock, patch\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_855.\"\"\"\n def test_successful_download_and_extraction(self):\n \"\"\"Test a successful download and extraction.\"\"\"\n result = f_855(\n \"https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-zip-file.zip\",\n \"test.zip\",\n )\n self.assertIn(\"Download and extraction successful\", result[0])\n self.assertTrue(len(result[1]) > 0)\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"Test an invalid URL.\"\"\"\n mock_get.return_value.status_code = 404\n result = f_855(\"http://invalidurl.com/file.zip\", \"test.zip\")\n self.assertIn(\"Download failed\", result[0])\n self.assertEqual(result[1], [])\n @patch(\"requests.get\")\n def test_non_200_http_response(self, mock_get):\n \"\"\"Test a non-200 HTTP response.\"\"\"\n mock_get.return_value.status_code = 404\n result = f_855(\"http://example.com/file.zip\", \"test.zip\")\n 
self.assertIn(\"Download failed\", result[0])\n self.assertEqual(result[1], [])\n @patch(\"requests.get\")\n def test_network_error(self, mock_get):\n \"\"\"Test a network error.\"\"\"\n mock_get.side_effect = requests.exceptions.ConnectionError\n result = f_855(\"http://example.com/file.zip\", \"test.zip\")\n self.assertIn(\"Error\", result[0])\n self.assertEqual(result[1], [])\n @patch(\"builtins.open\", new_callable=MagicMock)\n @patch(\"requests.get\")\n @patch(\"zipfile.ZipFile\")\n def test_corrupted_zip_file(self, mock_zip, mock_get, mock_open):\n \"\"\"Test a corrupted zip file.\"\"\"\n # Mock the response to simulate a successful download\n mock_response = MagicMock()\n mock_response.status_code = 200\n mock_response.iter_content = MagicMock(return_value=[b\"data\"])\n mock_get.return_value = mock_response\n # Mock the zipfile to raise a BadZipFile exception\n mock_zip.side_effect = zipfile.BadZipFile\n # Run the function\n result = f_855(\"http://example.com/corrupted.zip\", \"corrupted.zip\")\n # Check that the result indicates an error related to zip file extraction\n self.assertIn(\"Error\", result[0])\n self.assertIsInstance(result[1], list)\n self.assertEqual(len(result[1]), 0)\n @patch(\"requests.get\")\n def test_request_exception(self, mock_get):\n \"\"\"Test a network error.\"\"\"\n # Mock the requests.get to raise a RequestException\n mock_get.side_effect = requests.exceptions.RequestException\n # Run the function with a sample URL and filename\n result = f_855(\"http://example.com/file.zip\", \"test.zip\")\n # Check that the result indicates an error related to the network request\n self.assertIn(\"Error\", result[0])\n self.assertIsInstance(result[1], list)\n self.assertEqual(len(result[1]), 0)\n def tearDown(self):\n shutil.rmtree(DOWNLOAD_DIR, ignore_errors=True)\n shutil.rmtree(ZIP_DIR, ignore_errors=True)", "apis": ["zipfile.ZipFile", "requests.get", "zipfile.BadZipFile", "requests.exceptions", "pathlib.Path"], "libs": ["requests", 
"zipfile", "pathlib"], "doc": {"description": ["Downloads and extracts a zip file from a specified URL."], "note": ["the status message will contain \"Error\" when:", "Network-related exceptions are raised if the download fails.", "File-related exceptions are raised if there is an issue with file handling or extraction."], "params": ["url (str): The URL of the zip file to download.", "filename (str): The filename under which the downloaded zip file will be saved."], "returns": ["tuple: A tuple containing a status message and a list of filenames in the unzipped directory, or an empty list if extraction fails."], "reqs": ["requests", "pathlib.Path", "zipfile"], "raises": [], "example": [">>> f_855('http://example.com/myfile.zip', 'myfile.zip')", "('Download and extraction successful', ['file1.txt', 'file2.txt'])"]}} -{"task_id": "f_589", "prompt": "import pandas as pd\nfrom itertools import combinations\n\n# Constants\nMIN_PERCENTAGE = 0.75\n\ndef f_589(data, cols, percentage):\n \"\"\"\n Find all combinations of columns from a given DataFrame so that the absolute correlation between them is greater than a certain threshold.\n\n Parameters:\n - data (list): List of lists with the data, where the length of the inner list equals the number of columns\n - cols (list): List of column names\n - percentage (float): The threshold for the absolute correlation.\n\n Returns:\n - corr_combinations (list): A list of tuples where each tuple contains two column names.\n\n Requirements:\n - pandas\n - itertools\n\n Example:\n >>> result = f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.9)\n >>> print(result)\n [('x', 'y')]\n \"\"\"", "canonical_solution": " if not 0 <= percentage <= 1:\n raise ValueError('Percentage must be between 0 and 1')\n df = pd.DataFrame(data, columns=cols)\n corr_matrix = df.corr().abs()\n columns = corr_matrix.columns\n corr_combinations = []\n\n for col1, col2 in combinations(columns, 2):\n if corr_matrix.loc[col1, col2] > 
percentage:\n corr_combinations.append((col1, col2))\n\n return corr_combinations", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.9), [('x', 'y')])\n def test_case_2(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.5), [('x', 'y'), ('x', 'z'), ('y', 'z')])\n def test_case_3(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.1), [('x', 'y'), ('x', 'z'), ('y', 'z')])\n def test_case_4(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.0), [('x', 'y'), ('x', 'z'), ('y', 'z')])\n def test_case_5(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 1.0), [])", "apis": ["itertools.combinations", "pandas.DataFrame"], "libs": ["pandas", "itertools"], "doc": {"description": ["Find all combinations of columns from a given DataFrame so that the absolute correlation between them is greater than a certain threshold."], "note": [], "params": ["data (list): List of lists with the data, where the length of the inner list equals the number of columns", "cols (list): List of column names", "percentage (float): The threshold for the absolute correlation."], "returns": ["corr_combinations (list): A list of tuples where each tuple contains two column names."], "reqs": ["pandas", "itertools"], "raises": [], "example": [">>> result = f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.9)", ">>> print(result)", "[('x', 'y')]"]}} -{"task_id": "f_770", "prompt": "from collections import Counter\nimport itertools\nimport string\n\n\ndef f_770(word: str) -> dict:\n \"\"\"\n Create a dictionary containing all possible two-letter combinations of the lowercase English alphabets. 
\n The dictionary values represent the frequency of these two-letter combinations in the given word.\n If a combination does not appear in the word, its value will be 0.\n\n Requirements:\n - collections.Counter\n - itertools\n - string\n \n Parameters:\n - word (str): The input string containing alphabetic characters.\n\n Returns:\n - dict: A dictionary with keys as two-letter alphabet combinations and values as their counts in the word.\n\n Requirements:\n - The function uses the `collections.Counter` library to count the occurrences of two-letter combinations.\n - The function uses the `itertools.permutations` method to generate all two-letter combinations of alphabets.\n - The function uses the `string` library to get a string of lowercase alphabets.\n\n Example:\n >>> f_770('abcdef')\n {'ab': 1, 'ac': 0, 'ad': 0, ..., 'yx': 0, 'yz': 0, 'za': 0, ..., 'zx': 0, 'zy': 0}\n \"\"\"", "canonical_solution": " ALPHABETS = string.ascii_lowercase\n # Generate all two-letter combinations of alphabets\n permutations = [''.join(x) for x in itertools.permutations(ALPHABETS, 2)]\n combinations = permutations + [x*2 for x in ALPHABETS]\n \n # Generate all two-letter combinations in the word\n word_combinations = [''.join(x) for x in zip(word, word[1:])]\n # Count the occurrences of each two-letter combination in the word\n word_counter = Counter(word_combinations)\n\n # Create the dictionary with the counts\n return {key: word_counter.get(key, 0) for key in combinations}", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_770('abcdef')\n self.assertEqual(result['ab'], 1)\n self.assertEqual(result['ac'], 0)\n self.assertEqual(result['bc'], 1)\n self.assertEqual(result['cb'], 0)\n self.assertEqual(result['zz'], 0)\n \n def test_case_2(self):\n result = f_770('aabbcc')\n self.assertEqual(result['aa'], 1)\n self.assertEqual(result['ab'], 1)\n self.assertEqual(result['ba'], 0)\n self.assertEqual(result['bb'], 1)\n 
self.assertEqual(result['bc'], 1)\n \n def test_case_3(self):\n result = f_770('fedcba')\n self.assertEqual(result['fe'], 1)\n self.assertEqual(result['ef'], 0)\n self.assertEqual(result['dc'], 1)\n self.assertEqual(result['ba'], 1)\n self.assertEqual(result['zz'], 0)\n def test_case_4(self):\n result = f_770('cadbfe')\n self.assertEqual(result['ca'], 1)\n self.assertEqual(result['ad'], 1)\n self.assertEqual(result['db'], 1)\n self.assertEqual(result['fe'], 1)\n self.assertEqual(result['zz'], 0)\n def test_case_5(self):\n result = f_770('')\n self.assertEqual(result['ab'], 0)\n self.assertEqual(result['zz'], 0)", "apis": ["string.ascii_lowercase", "itertools.permutations", "collections.Counter"], "libs": ["string", "collections", "itertools"], "doc": {"description": ["Create a dictionary containing all possible two-letter combinations of the lowercase English alphabets.", "The dictionary values represent the frequency of these two-letter combinations in the given word.", "If a combination does not appear in the word, its value will be 0."], "note": [], "params": ["word (str): The input string containing alphabetic characters."], "returns": ["dict: A dictionary with keys as two-letter alphabet combinations and values as their counts in the word."], "reqs": ["collections.Counter", "itertools", "string", "The function uses the `collections.Counter` library to count the occurrences of two-letter combinations.", "The function uses the `itertools.permutations` method to generate all two-letter combinations of alphabets.", "The function uses the `string` library to get a string of lowercase alphabets."], "raises": [], "example": [">>> f_770('abcdef')", "{'ab': 1, 'ac': 0, 'ad': 0, ..., 'yx': 0, 'yz': 0, 'za': 0, ..., 'zx': 0, 'zy': 0}"]}} -{"task_id": "f_868", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import MinMaxScaler\n\n# Constants\nPLOT_TITLE = \"Scaled Values\"\n\n\ndef f_868(data_dict):\n \"\"\"\n Scales the values 
in a given dictionary using MinMaxScaler and plots the scaled data.\n\n Parameters:\n - data_dict (dict): A dictionary where keys represent column names and values are lists of numerical data.\n The values may contain missing data (None), which are handled by dropping them before scaling.\n\n Returns:\n - pandas.DataFrame containing the scaled data.\n - matplotlib Axes object that displays the plot of the scaled data.\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Example:\n >>> data = {'a': [1, 2, None, 4], 'b': [5, None, 7, 8]}\n >>> scaled_df, plot_ax = f_868(data)\n >>> scaled_df\n a b\n 0 0.0 0.0\n 1 1.0 1.0\n >>> plot_ax.get_title()\n 'Scaled Values'\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_dict).dropna()\n\n if df.empty:\n ax = plt.gca()\n ax.set_title(PLOT_TITLE)\n return df, ax\n\n scaler = MinMaxScaler()\n scaled_data = scaler.fit_transform(df)\n df_scaled = pd.DataFrame(scaled_data, columns=df.columns)\n\n ax = df_scaled.plot()\n ax.set_title(PLOT_TITLE)\n\n return df_scaled, ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Unit tests for the function.\"\"\"\n def test_empty_data(self):\n \"\"\"\n Test with an empty dictionary. Should return an empty DataFrame and a plot object.\n \"\"\"\n result_df, result_ax = f_868({})\n self.assertTrue(result_df.empty)\n self.assertIsNotNone(result_ax)\n def test_all_none_data(self):\n \"\"\"\n Test with a dictionary where all values are None. Should return an empty DataFrame and a plot object.\n \"\"\"\n data = {\"a\": [None, None], \"b\": [None, None]}\n result_df, result_ax = f_868(data)\n self.assertTrue(result_df.empty)\n self.assertIsNotNone(result_ax)\n def test_normal_data(self):\n \"\"\"\n Test with a normal data dictionary. 
Should return a non-empty DataFrame and a plot object.\n \"\"\"\n data = {\"a\": [1, 2, 3], \"b\": [4, 5, 6]}\n result_df, result_ax = f_868(data)\n self.assertEqual(result_ax.get_title(), \"Scaled Values\")\n self.assertFalse(result_df.empty)\n self.assertEqual(result_df.shape, (3, 2))\n self.assertIsNotNone(result_ax)\n def test_with_missing_values(self):\n \"\"\"\n Test data with some missing values. Missing values should be dropped, and scaled data should be returned.\n \"\"\"\n data = {\"a\": [1, None, 3], \"b\": [4, 5, None]}\n result_df, result_ax = f_868(data)\n self.assertEqual(result_df.shape, (1, 2)) # Only one row without missing values\n self.assertIsNotNone(result_ax)\n def test_with_negative_values(self):\n \"\"\"\n Test data with negative values. Should handle negative values correctly and return scaled data.\n \"\"\"\n data = {\"a\": [-1, -2, -3], \"b\": [1, 2, 3]}\n result_df, result_ax = f_868(data)\n self.assertFalse(result_df.empty)\n self.assertEqual(result_df.shape, (3, 2))\n self.assertIsNotNone(result_ax)", "apis": ["sklearn.preprocessing.MinMaxScaler", "matplotlib.pyplot.gca", "pandas.DataFrame"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["Scales the values in a given dictionary using MinMaxScaler and plots the scaled data."], "note": [], "params": ["data_dict (dict): A dictionary where keys represent column names and values are lists of numerical data.", "The values may contain missing data (None), which are handled by dropping them before scaling."], "returns": ["pandas.DataFrame containing the scaled data.", "matplotlib Axes object that displays the plot of the scaled data."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": [], "example": [">>> data = {'a': [1, 2, None, 4], 'b': [5, None, 7, 8]}", ">>> scaled_df, plot_ax = f_868(data)", ">>> scaled_df", "a b", "0 0.0 0.0", "1 1.0 1.0", ">>> plot_ax.get_title()", "'Scaled Values'"]}} -{"task_id": "f_895", "prompt": "import collections\nimport numpy 
as np\nimport matplotlib.pyplot as plt\n\n\ndef f_895(data_dict):\n \"\"\"\n Analyze the uniformity of a distribution represented by a dictionary of categories and their counts,\n and create a description to introduce this distribution.\n\n Parameters:\n - data_dict (dict): A dictionary with categories as keys and counts as values.\n\n Returns:\n - tuple: A tuple containing:\n - matplotlib.axes._subplots.AxesSubplot: The axes object of the histogram.\n - str: A message indicating whether the distribution is uniform (\"The distribution is uniform.\")\n or not (\"The distribution is not uniform.\").\n\n Note:\n - If 'data_dict' is empty, the function returns None and a message \"The distribution is uniform.\"\n indicating that an empty distribution is considered uniform by default.\n - If 'data_dict' is not empty, it calculates the average count of the categories.\n - The distribution is considered uniform if the absolute difference between each count and the\n average count is less than or equal to 1e-5.\n - If any count's absolute difference with the average count is more than 1e-5, the distribution\n is considered not uniform.\n - The function then creates a histogram of the counts using matplotlib, with the number of bins\n being the lesser of 10 or the number of unique counts. 
The histogram's x-ticks are labeled with\n the category names.\n\n Requirements:\n - collections\n - numpy\n - matplotlib\n\n Example:\n >>> data = {'A': 2, 'B': 3, 'C': 4, 'D': 1, 'E': 2}\n >>> ax, message = f_895(data)\n >>> print(message)\n The distribution is not uniform.\n \"\"\"", "canonical_solution": " if not data_dict:\n return None, \"The distribution is uniform.\"\n\n data_counter = collections.Counter(data_dict)\n counts = list(data_counter.values())\n avg_count = sum(counts) / len(counts)\n uniform = all(abs(count - avg_count) <= 1e-5 for count in counts)\n message = (\n \"The distribution is uniform.\"\n if uniform\n else \"The distribution is not uniform.\"\n )\n\n _, ax = plt.subplots()\n ax.hist(\n counts,\n bins=np.linspace(min(counts), max(counts), min(10, len(counts))),\n rwidth=0.8,\n )\n ax.set_xticks(np.arange(len(data_dict)) + 1)\n ax.set_xticklabels(list(data_dict.keys()))\n return ax, message", "test": "import numpy as np\nimport matplotlib.pyplot as plt\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_895.\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test whether the function correctly identifies a uniform distribution.\"\"\"\n data = {\"A\": 5, \"B\": 5, \"C\": 5}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is uniform.\")\n def test_non_uniform_distribution(self):\n \"\"\"Test whether the function correctly identifies a non-uniform distribution.\"\"\"\n data = {\"A\": 3, \"B\": 2, \"C\": 4}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is not uniform.\")\n def test_empty_dictionary(self):\n \"\"\"Test the function with an empty dictionary.\"\"\"\n data = {}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is uniform.\")\n def test_single_category(self):\n \"\"\"Test the function with a single category.\"\"\"\n data = {\"A\": 1}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is uniform.\")\n def 
test_large_distribution(self):\n \"\"\"Test the function with a large number of categories.\"\"\"\n data = {chr(i): i for i in range(65, 91)} # A to Z with ascending counts\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is not uniform.\")", "apis": ["numpy.arange", "numpy.linspace", "collections.Counter", "matplotlib.pyplot.subplots"], "libs": ["numpy", "collections", "matplotlib"], "doc": {"description": ["Analyze the uniformity of a distribution represented by a dictionary of categories and their counts,", "and create a description to introduce this distribution."], "note": ["If 'data_dict' is empty, the function returns None and a message \"The distribution is uniform.\"", "indicating that an empty distribution is considered uniform by default.", "If 'data_dict' is not empty, it calculates the average count of the categories.", "The distribution is considered uniform if the absolute difference between each count and the", "average count is less than or equal to 1e-5.", "If any count's absolute difference with the average count is more than 1e-5, the distribution", "is considered not uniform.", "The function then creates a histogram of the counts using matplotlib, with the number of bins", "being the lesser of 10 or the number of unique counts. 
The histogram's x-ticks are labeled with", "the category names."], "params": ["data_dict (dict): A dictionary with categories as keys and counts as values."], "returns": ["tuple: A tuple containing:", "matplotlib.axes._subplots.AxesSubplot: The axes object of the histogram.", "str: A message indicating whether the distribution is uniform (\"The distribution is uniform.\")", "or not (\"The distribution is not uniform.\")."], "reqs": ["collections", "numpy", "matplotlib"], "raises": [], "example": [">>> data = {'A': 2, 'B': 3, 'C': 4, 'D': 1, 'E': 2}", ">>> ax, message = f_895(data)", ">>> print(message)", "The distribution is not uniform."]}} -{"task_id": "f_809", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_809(df: pd.DataFrame) -> pd.DataFrame:\n \"\"\"\n Calculate the cumulative sum for each column in a given DataFrame and plot\n the results in a bar chart.\n\n Args:\n df (pd.DataFrame): The input DataFrame with numerical values.\n Must not be empty and must contain numeric data to plot.\n Returns:\n - tuple: A tuple containing:\n (1) A DataFrame with cumulative sums for each column.\n (2) A matplotlib bar chart Figure of these cumulative sums.\n\n Raises:\n - ValueError: If the DataFrame is empty or contains non-numeric data.\n\n Requirements:\n - pandas\n - matplotlib\n\n Note:\n - NaN values are ignored in the cumulative sum calculation, i.e. treated as\n zero for the purpose of the sum without changing existing values to NaN.\n - The plot title is set to 'Cumulative Sum per Column'.\n - X-axis label is 'Index' and Y-axis label is 'Cumulative Sum'.\n - A legend is included in the plot.\n\n Example:\n >>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n >>> output_df, fig = f_809(input_df)\n >>> output_df\n A B\n 0 1 4\n 1 3 9\n 2 6 15\n >>> fig\n
\n \"\"\"", "canonical_solution": " cumsum_df = df.cumsum()\n\n fig, ax = plt.subplots()\n cumsum_df.plot(kind=\"bar\", ax=ax)\n ax.set_title(\"Cumulative Sum per Column\")\n ax.set_xlabel(\"Index\")\n ax.set_ylabel(\"Cumulative Sum\")\n ax.legend()\n\n return cumsum_df, fig", "test": "import numpy as np\nimport pandas as pd\nimport unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Setup common for all tests\n self.input_df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n self.expected_df = pd.DataFrame({\"A\": [1, 3, 6], \"B\": [4, 9, 15]})\n def test_case_1(self):\n # Test basic case\n output_df, _ = f_809(self.input_df)\n pd.testing.assert_frame_equal(output_df, self.expected_df)\n def test_case_2(self):\n # Test cumulative sum correctness for a case with negative values\n input_df_neg = pd.DataFrame({\"A\": [1, -2, 3], \"B\": [-4, 5, -6]})\n expected_df_neg = pd.DataFrame({\"A\": [1, -1, 2], \"B\": [-4, 1, -5]})\n output_df_neg, _ = f_809(input_df_neg)\n pd.testing.assert_frame_equal(output_df_neg, expected_df_neg)\n def test_case_3(self):\n # Test bar chart properties\n _, fig = f_809(self.input_df)\n self.assertIsInstance(fig, plt.Figure)\n ax = fig.axes[0] # Get the Axes object from the figure\n # Verify the title, x-label, and y-label\n self.assertEqual(ax.get_title(), \"Cumulative Sum per Column\")\n self.assertEqual(ax.get_xlabel(), \"Index\")\n self.assertEqual(ax.get_ylabel(), \"Cumulative Sum\")\n # Ensure that a legend is present and contains the correct labels\n legend_labels = [text.get_text() for text in ax.get_legend().get_texts()]\n expected_labels = self.input_df.columns.tolist()\n self.assertEqual(legend_labels, expected_labels)\n def test_case_4(self):\n # Test with an empty DataFrame\n empty_df = pd.DataFrame()\n with self.assertRaises(Exception):\n f_809(empty_df)\n def test_case_5(self):\n # Test with DataFrame containing NaN values\n nan_df = pd.DataFrame({\"A\": [1, np.nan, 3], 
\"B\": [4, 5, np.nan]})\n nan_df_cumsum = nan_df.cumsum()\n output_nan_df, _ = f_809(nan_df)\n pd.testing.assert_frame_equal(output_nan_df, nan_df_cumsum)\n def test_case_6(self):\n # Test with DataFrame containing all zeros\n zeros_df = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n expected_zeros_df = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n output_zeros_df, _ = f_809(zeros_df)\n pd.testing.assert_frame_equal(output_zeros_df, expected_zeros_df)\n def test_case_7(self):\n # Test with a DataFrame containing only one row\n one_row_df = pd.DataFrame({\"A\": [1], \"B\": [2]})\n expected_one_row_df = pd.DataFrame({\"A\": [1], \"B\": [2]})\n output_one_row_df, _ = f_809(one_row_df)\n pd.testing.assert_frame_equal(output_one_row_df, expected_one_row_df)", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Calculate the cumulative sum for each column in a given DataFrame and plot", "the results in a bar chart.", "Args:", "df (pd.DataFrame): The input DataFrame with numerical values.", "Must not be empty and must contain numeric data to plot."], "note": ["NaN values are ignored in the cumulative sum calculation, i.e. treated as", "zero for the purpose of the sum without changing existing values to NaN.", "The plot title is set to 'Cumulative Sum per Column'.", "X-axis label is 'Index' and Y-axis label is 'Cumulative Sum'.", "A legend is included in the plot."], "params": [], "returns": ["tuple: A tuple containing:", "(1) A DataFrame with cumulative sums for each column.", "(2) A matplotlib bar chart Figure of these cumulative sums."], "reqs": ["pandas", "matplotlib"], "raises": ["ValueError: If the DataFrame is empty or contains non-numeric data."], "example": [">>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})", ">>> output_df, fig = f_809(input_df)", ">>> output_df", "A B", "0 1 4", "1 3 9", "2 6 15", ">>> fig", "
"]}} -{"task_id": "f_559", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.linear_model import LinearRegression\n\nROWS = 100\nCOLUMNS = ['X', 'Y']\n\ndef f_559(df):\n \"\"\"\n Given a Pandas DataFrame with random numeric values and columns X & Y, use sklearn's linear regression to match the data to a linear model.\n\n Parameters:\n - df (DataFrame): The DataFrame to use.\n\n Returns:\n - model (LinearRegression): The fitted linear model.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.normal(size=(100, 2)), columns=['X', 'Y'])\n >>> model = f_559(df)\n >>> print(model)\n LinearRegression()\n \"\"\"", "canonical_solution": " X = pd.DataFrame(df[['X']]) # Extracting column 'X' as a DataFrame\n y = pd.Series(df['Y']) # Extracting column 'Y' as a Series\n \n # Fitting the linear regression model\n model = LinearRegression().fit(X, y)\n \n return model", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n \n def test_case_2(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n def test_case_3(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n self.assertTrue(model.intercept_ is not None)\n def test_case_4(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n self.assertTrue(model.intercept_ is not None)\n self.assertTrue(model.score(df[['X']], df['Y']) is not None)\n def test_case_5(self):\n df = 
pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n self.assertTrue(model.intercept_ is not None)\n self.assertTrue(model.score(df[['X']], df['Y']) is not None)\n self.assertTrue(model.score(df[['X']], df['Y']) >= 0)", "apis": ["pandas.Series", "pandas.DataFrame", "sklearn.linear_model.LinearRegression"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Given a Pandas DataFrame with random numeric values and columns X & Y, use sklearn's linear regression to match the data to a linear model."], "note": [], "params": ["df (DataFrame): The DataFrame to use."], "returns": ["model (LinearRegression): The fitted linear model."], "reqs": ["numpy", "pandas", "sklearn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.normal(size=(100, 2)), columns=['X', 'Y'])", ">>> model = f_559(df)", ">>> print(model)", "LinearRegression()"]}} -{"task_id": "f_843", "prompt": "import urllib.request\nimport os\nimport json\nimport pandas as pd\n\n# Constants\nTARGET_JSON_FILE = \"downloaded_file.json\"\n\n\ndef f_843(url):\n \"\"\"\n This function retrieves a JSON file from the given URL using urllib.request.urlretrieve,\n temporarily saving it as 'downloaded_file.json'. 
It then opens and reads this file,\n converts the JSON content into a pandas DataFrame, and finally deletes the temporary JSON file.\n\n Parameters:\n url (str): The URL of the JSON file to be downloaded.\n\n Returns:\n pandas.DataFrame: A DataFrame constructed from the JSON data in the downloaded file.\n\n Requirements:\n - urllib.request\n - os\n - json\n - pandas\n\n Example:\n >>> f_843('http://example.com/employees.json')\n name age city\n 0 Alice 25 New York\n 1 Bob 30 San Francisco\n \"\"\"", "canonical_solution": " urllib.request.urlretrieve(url, TARGET_JSON_FILE)\n\n with open(TARGET_JSON_FILE, \"r\") as f:\n data = json.load(f)\n\n os.remove(TARGET_JSON_FILE)\n\n return pd.DataFrame(data)", "test": "import unittest\nimport pandas as pd\nfrom unittest.mock import patch, mock_open\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_843 function.\"\"\"\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_sample_1(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function returns the correct DataFrame for a given JSON file.\"\"\"\n url = \"http://example.com/sample_1.json\"\n sample_data = '[{\"name\": \"Alice\", \"age\": 25, \"city\": \"New York\"}, {\"name\": \"Bob\", \"age\": 30, \"city\": \"San Francisco\"}]'\n mock_urlretrieve.return_value = None\n with patch(\"builtins.open\", mock_open(read_data=sample_data)):\n expected_df = pd.DataFrame(\n [\n {\"name\": \"Alice\", \"age\": 25, \"city\": \"New York\"},\n {\"name\": \"Bob\", \"age\": 30, \"city\": \"San Francisco\"},\n ]\n )\n result_df = f_843(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n mock_remove.assert_called_once_with(\"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_sample_2(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function returns the correct DataFrame for a given JSON file.\"\"\"\n url 
= \"http://example.com/sample_2.json\"\n sample_data = '[{\"product\": \"Laptop\", \"price\": 1000}, {\"product\": \"Mouse\", \"price\": 20}, {\"product\": \"Keyboard\", \"price\": 50}]'\n mock_urlretrieve.return_value = None\n with patch(\"builtins.open\", mock_open(read_data=sample_data)):\n expected_df = pd.DataFrame(\n [\n {\"product\": \"Laptop\", \"price\": 1000},\n {\"product\": \"Mouse\", \"price\": 20},\n {\"product\": \"Keyboard\", \"price\": 50},\n ]\n )\n result_df = f_843(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n mock_remove.assert_called_once_with(\"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_empty_json(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function returns an empty DataFrame for an empty JSON file.\"\"\"\n url = \"http://example.com/empty.json\"\n sample_data = \"[]\"\n mock_urlretrieve.return_value = None\n with patch(\"builtins.open\", mock_open(read_data=sample_data)):\n expected_df = pd.DataFrame()\n result_df = f_843(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n def test_invalid_url(self, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the URL is invalid.\"\"\"\n url = \"http://example.com/non_existent.json\"\n mock_urlretrieve.side_effect = Exception(\"URL retrieval failed\")\n with self.assertRaises(Exception):\n f_843(url)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_invalid_json(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the JSON file is invalid.\"\"\"\n url = \"http://example.com/invalid.json\"\n sample_data = \"invalid json content\"\n 
mock_urlretrieve.return_value = None\n with patch(\n \"builtins.open\", mock_open(read_data=sample_data)\n ), self.assertRaises(Exception):\n f_843(url)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")", "apis": ["urllib.request.urlretrieve", "json.load", "urllib.request", "pandas.DataFrame", "os.remove"], "libs": ["urllib", "pandas", "json", "os"], "doc": {"description": ["This function retrieves a JSON file from the given URL using urllib.request.urlretrieve,", "temporarily saving it as 'downloaded_file.json'. It then opens and reads this file,", "converts the JSON content into a pandas DataFrame, and finally deletes the temporary JSON file."], "note": [], "params": ["url (str): The URL of the JSON file to be downloaded."], "returns": ["pandas.DataFrame: A DataFrame constructed from the JSON data in the downloaded file."], "reqs": ["urllib.request", "os", "json", "pandas"], "raises": [], "example": [">>> f_843('http://example.com/employees.json')", "name age city", "0 Alice 25 New York", "1 Bob 30 San Francisco"]}} -{"task_id": "f_852", "prompt": "import xml.etree.ElementTree as ET\nimport csv\n\n\ndef f_852(xml_content, output_csv_path):\n \"\"\"\n Parses XML content from a string and converts it into a CSV format.\n\n Parameters:\n - xml_content (str): A string containing the XML content to be parsed. It should\n be well-formed XML.\n - output_csv_path (str): The file path where the resulting CSV file will be saved.\n This path must be valid and accessible for writing.\n\n Returns:\n - None: The function does not return any value. Instead, it writes the output to\n a CSV file at the specified path.\n\n Raises:\n - ET.ParseError: This exception is raised if the input XML content is malformed or\n cannot be successfully parsed. 
The exception message includes\n details about the parsing error.\n - IOError: Raised if there is an issue with writing to the specified CSV file path.\n This can happen due to reasons like invalid file path, full disk space,\n lack of write permissions, etc. The exception message provides details\n about the IO error.\n\n\n Requirements:\n - xml\n - csv\n\n Example:\n >>> f_852('data', 'path/to/output.csv')\n >>> with open('path/to/output.csv', 'r') as f:\n ... print(f.read())\n element,data\n\n Note:\n - Ensure that the XML content passed to the function is well-formed.\n - The output CSV path should be a valid file path where the user has write\n permissions, to prevent IOError.\n \"\"\"", "canonical_solution": " try:\n root = ET.fromstring(xml_content)\n data = [[elem.tag, elem.text] for elem in root.iter()]\n\n with open(output_csv_path, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n writer = csv.writer(f)\n writer.writerows(data)\n except ET.ParseError as e:\n raise ET.ParseError(f\"Error parsing XML: {e}\") from e\n except IOError as e:\n raise IOError(f\"Error writing CSV file: {e}\") from e", "test": "import unittest\nimport xml.etree.ElementTree as ET\nimport csv\nimport shutil\nfrom pathlib import Path\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_852.\"\"\"\n test_data_dir = \"mnt/data/f_852_data_chien\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up method to create a directory for test files.\"\"\"\n cls.test_dir = Path(cls.test_data_dir)\n cls.test_dir.mkdir(parents=True, exist_ok=True)\n def check_csv_content(self, xml_content, csv_path):\n \"\"\"Helper function to check if the CSV content matches the XML content.\"\"\"\n root = ET.fromstring(xml_content)\n expected_data = [\n [elem.tag, elem.text if elem.text is not None else \"\"]\n for elem in root.iter()\n ]\n with open(csv_path, \"r\", encoding=\"utf-8\") as file:\n reader = csv.reader(file)\n csv_data = list(reader)\n self.assertEqual(expected_data, 
csv_data)\n def test_simple_xml(self):\n \"\"\"Test with simple XML content.\"\"\"\n xml_content = \"data\"\n csv_output = self.test_dir / \"output_scenario_0.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_nested_xml(self):\n \"\"\"Test with nested XML content.\"\"\"\n xml_content = \"data\"\n csv_output = self.test_dir / \"output_scenario_1.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_empty_xml(self):\n \"\"\"Test with an empty XML.\"\"\"\n xml_content = \"\"\n csv_output = self.test_dir / \"output_scenario_2.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_xml_with_attributes(self):\n \"\"\"Test with an XML that contains elements with attributes.\"\"\"\n xml_content = 'data'\n csv_output = self.test_dir / \"output_scenario_3.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_large_xml(self):\n \"\"\"Test with a larger XML file.\"\"\"\n xml_content = (\n \"\"\n + \"\".join([f\"{i}\" for i in range(100)])\n + \"\"\n )\n csv_output = self.test_dir / \"output_scenario_4.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_invalid_xml_content(self):\n \"\"\"Test with invalid XML content to trigger ET.ParseError.\"\"\"\n xml_content = \"datadata\"\n csv_output = self.test_dir / \"non_existent_directory\" / \"output.csv\"\n with self.assertRaises(IOError):\n f_852(xml_content, csv_output)\n @classmethod\n def tearDownClass(cls):\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["xml.etree.ElementTree.ParseError", "csv.writer", "xml.etree.ElementTree.fromstring"], "libs": ["csv", "xml"], "doc": {"description": ["Parses XML content from a string and converts it into a CSV format."], "note": 
["Ensure that the XML content passed to the function is well-formed.", "The output CSV path should be a valid file path where the user has write", "permissions, to prevent IOError."], "params": ["xml_content (str): A string containing the XML content to be parsed. It should", "be well-formed XML.", "output_csv_path (str): The file path where the resulting CSV file will be saved.", "This path must be valid and accessible for writing."], "returns": ["None: The function does not return any value. Instead, it writes the output to", "a CSV file at the specified path."], "reqs": ["xml", "csv"], "raises": ["ET.ParseError: This exception is raised if the input XML content is malformed or", "cannot be successfully parsed. The exception message includes", "details about the parsing error.", "IOError: Raised if there is an issue with writing to the specified CSV file path.", "This can happen due to reasons like invalid file path, full disk space,", "lack of write permissions, etc. The exception message provides details", "about the IO error."], "example": [">>> f_852('data', 'path/to/output.csv')", ">>> with open('path/to/output.csv', 'r') as f:", "... print(f.read())", "element,data"]}} -{"task_id": "f_930", "prompt": "import string\nimport random\nimport pandas as pd\nimport numpy as np\n\n# Constants\nNUM_SAMPLES = 1000 # Number of samples\n\n\ndef f_930():\n \"\"\"\n Generates a DataFrame with two columns: a string field and a float field.\n The string field contains randomly generated strings of 10 ASCII letters.\n The float field contains randomly generated numbers between 0 and 10000,\n formatted with two decimal places and a comma as the thousands separator.\n\n Parameters:\n - None\n\n Returns:\n DataFrame: A pandas DataFrame with NUM_SAMPLES rows. 
Each row contains a\n random string in the 'String Field' column and a formatted float in the\n 'Float Field' column.\n\n Requirements:\n - string\n - random\n - pandas\n - numpy\n\n Example:\n >>> random.seed(0)\n >>> np.random.seed(0)\n >>> dataset = f_930()\n >>> print(dataset.head(1))\n String Field Float Field\n 0 RNvnAvOpyE 5,488.14\n\n Note: The exact values in the dataset will vary as they are randomly generated.\n \"\"\"", "canonical_solution": " data = {\n \"String Field\": [\n \"\".join(random.choices(string.ascii_letters, k=10))\n for _ in range(NUM_SAMPLES)\n ],\n \"Float Field\": [f\"{x:,.2f}\" for x in np.random.uniform(0, 10000, NUM_SAMPLES)],\n }\n\n df = pd.DataFrame(data)\n\n return df", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_930.\"\"\"\n def test_dataframe_creation(self):\n \"\"\"\n Test if the function returns a pandas DataFrame.\n \"\"\"\n random.seed(1)\n result = f_930()\n self.assertIsInstance(result, pd.DataFrame)\n def test_row_count(self):\n \"\"\"\n Test if the DataFrame contains the correct number of rows.\n \"\"\"\n random.seed(2)\n result = f_930()\n self.assertEqual(len(result), NUM_SAMPLES)\n def test_column_count(self):\n \"\"\"\n Test if the DataFrame contains exactly two columns.\n \"\"\"\n random.seed(3)\n result = f_930()\n self.assertEqual(len(result.columns), 2)\n def test_string_field_format(self):\n \"\"\"\n Test if the 'String Field' contains strings of 10 ASCII letters.\n \"\"\"\n random.seed(4)\n result = f_930()\n all_strings = all(result[\"String Field\"].str.match(\"^[A-Za-z]{10}$\"))\n self.assertTrue(all_strings)\n def test_float_field_format(self):\n \"\"\"\n Test if the 'Float Field' contains formatted float strings.\n \"\"\"\n random.seed(5)\n result = f_930()\n all_floats = all(\n isinstance(float(val.replace(\",\", \"\")), float)\n for val in result[\"Float Field\"]\n )\n self.assertTrue(all_floats)", "apis": ["numpy.random", 
"numpy.random.uniform", "string.ascii_letters", "random.choices", "pandas.DataFrame"], "libs": ["numpy", "pandas", "random", "string"], "doc": {"description": ["Generates a DataFrame with two columns: a string field and a float field.", "The string field contains randomly generated strings of 10 ASCII letters.", "The float field contains randomly generated numbers between 0 and 10000,", "formatted with two decimal places and a comma as the thousands separator."], "note": ["The exact values in the dataset will vary as they are randomly generated."], "params": ["None"], "returns": ["DataFrame: A pandas DataFrame with NUM_SAMPLES rows. Each row contains a", "random string in the 'String Field' column and a formatted float in the", "'Float Field' column."], "reqs": ["string", "random", "pandas", "numpy"], "raises": [], "example": [">>> random.seed(0)", ">>> np.random.seed(0)", ">>> dataset = f_930()", ">>> print(dataset.head(1))", "String Field Float Field", "0 RNvnAvOpyE 5,488.14"]}} -{"task_id": "f_758", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\ndef f_758(df: pd.DataFrame) -> tuple:\n \"\"\"\n Visualize the distribution of stock closing prices using both a box plot and a histogram\n within a single figure. This function is designed to help understand the spread, central tendency,\n and the distribution shape of stock closing prices.\n\n Note:\n The tile of the box plot is set to 'Box Plot of Closing Prices' and the title of the histogram is set to 'Histogram of Closing Prices'.\n \n Requirements:\n - pandas\n - matplotlib.pyplot\n - seaborn\n\n Parameters:\n df (DataFrame): A pandas DataFrame containing at least one column named 'closing_price'\n with stock closing prices.\n\n Returns:\n tuple: A tuple containing two matplotlib.axes._subplots.AxesSubplot objects: the first for the boxplot\n and the second for the histogram.\n\n Example:\n >>> df = pd.DataFrame({\n ... 
'closing_price': [100, 101, 102, 103, 104, 150]\n ... })\n >>> boxplot_ax, histplot_ax = f_758(df)\n >>> print(boxplot_ax.get_title())\n Box Plot of Closing Prices\n >>> print(histplot_ax.get_title())\n Histogram of Closing Prices\n \"\"\"", "canonical_solution": " fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n \n boxplot_ax = sns.boxplot(x=df['closing_price'], ax=axes[0])\n boxplot_ax.set_title('Box Plot of Closing Prices')\n \n histplot_ax = sns.histplot(df['closing_price'], kde=True, ax=axes[1])\n histplot_ax.set_title('Histogram of Closing Prices')\n \n plt.tight_layout()\n plt.close(fig) # Prevent automatic figure display within Jupyter notebooks or interactive environments.\n \n return boxplot_ax, histplot_ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n# Assuming the function f_758 is defined in the same script, otherwise import it appropriately.\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n df = pd.DataFrame({\n 'closing_price': [100, 101, 102, 103, 104, 150]\n })\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n \n self.assertEqual(boxplot_ax.get_title(), 'Box Plot of Closing Prices')\n self.assertEqual(histplot_ax.get_title(), 'Histogram of Closing Prices')\n \n self.assertEqual(histplot_ax.get_xlabel(), 'closing_price')\n self.assertIn('Count', histplot_ax.get_ylabel()) # Check if 'Count' is part of the ylabel\n \n def test_empty_df(self):\n df = pd.DataFrame({'closing_price': []})\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n # Instead of checking if the plot \"has data,\" we ensure that it exists and does not raise an error.\n self.assertIsNotNone(boxplot_ax, \"Boxplot should be created even with empty data.\")\n self.assertIsNotNone(histplot_ax, \"Histogram should be created even with empty 
data.\")\n def test_invalid_column(self):\n df = pd.DataFrame({'price': [100, 101, 102]})\n with self.assertRaises(KeyError):\n f_758(df)\n def test_single_value_df(self):\n df = pd.DataFrame({'closing_price': [100]})\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n self.assertTrue(boxplot_ax.has_data(), \"Boxplot should handle a single value dataframe.\")\n self.assertTrue(histplot_ax.has_data(), \"Histogram should handle a single value dataframe.\")\n def test_large_values_df(self):\n df = pd.DataFrame({'closing_price': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]})\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n self.assertTrue(boxplot_ax.has_data(), \"Boxplot should handle large values.\")\n self.assertTrue(histplot_ax.has_data(), \"Histogram should handle large values.\")", "apis": ["matplotlib.pyplot.tight_layout", "seaborn.boxplot", "seaborn.histplot", "matplotlib.pyplot.close", "pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas", "seaborn"], "doc": {"description": ["Visualize the distribution of stock closing prices using both a box plot and a histogram", "within a single figure. This function is designed to help understand the spread, central tendency,", "and the distribution shape of stock closing prices."], "note": ["The tile of the box plot is set to 'Box Plot of Closing Prices' and the title of the histogram is set to 'Histogram of Closing Prices'."], "params": ["df (DataFrame): A pandas DataFrame containing at least one column named 'closing_price'", "with stock closing prices."], "returns": ["tuple: A tuple containing two matplotlib.axes._subplots.AxesSubplot objects: the first for the boxplot", "and the second for the histogram."], "reqs": ["pandas", "matplotlib.pyplot", "seaborn"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 
'closing_price': [100, 101, 102, 103, 104, 150]", "... })", ">>> boxplot_ax, histplot_ax = f_758(df)", ">>> print(boxplot_ax.get_title())", "Box Plot of Closing Prices", ">>> print(histplot_ax.get_title())", "Histogram of Closing Prices"]}} -{"task_id": "f_737", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nARRAY_SIZE = 10000\n\ndef f_737():\n \"\"\"\n Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution.\n\n Returns:\n Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes).\n\n Note:\n The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Values\". \n The x-axis is labeled \"Val\" and the y-axis is labeled \"Freq\". \n The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines.\n \n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> array, mean, std, ax = f_737()\n >>> print(mean, std)\n 250.7154 142.85617453522966\n >>> plt.show()\n \"\"\"", "canonical_solution": " array = np.random.randint(1, 500, size=ARRAY_SIZE)\n mean = np.mean(array)\n std = np.std(array)\n\n fig, ax = plt.subplots()\n ax.hist(array, bins='auto')\n ax.set_title('Histogram of Random Values')\n ax.set_xlabel('Val')\n ax.set_ylabel('Freq')\n return array, mean, std, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n np.random.seed(0)\n array, mean, std, ax = f_737()\n self.assertEqual(array.size, ARRAY_SIZE)\n self.assertEqual(mean, 250.7154)\n self.assertEqual(std, 142.85617453522966)\n self.assertEqual(ax.get_title(), 'Histogram of Random Values')\n def test_case_2(self):\n array, mean, std, ax = f_737()\n self.assertEqual(ax.get_xlabel(), 'Val')\n self.assertEqual(ax.get_ylabel(), 'Freq')\n def test_case_3(self):\n 
np.random.seed(42)\n array, mean, std, ax = f_737()\n self.assertEqual(array[0], 103)\n self.assertEqual(array[-1], 474)\n self.assertEqual(mean, 250.171)\n self.assertEqual(std, 144.01374920124815)\n \n def test_case_4(self):\n np.random.seed(142)\n array, mean, std, ax = f_737()\n self.assertEqual(array[0], 278)\n self.assertEqual(array[-1], 113)\n self.assertEqual(mean, 251.1245)\n self.assertEqual(std, 144.49066405740547)\n def test_case_5(self):\n np.random.seed(250)\n array, mean, std, ax = f_737()\n self.assertEqual(array[0], 367)\n self.assertEqual(array[-1], 190)\n self.assertEqual(mean, 249.037)\n self.assertEqual(std, 144.32681882103546)", "apis": ["numpy.random.randint", "numpy.std", "numpy.mean", "numpy.random", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution."], "note": ["The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Values\".", "The x-axis is labeled \"Val\" and the y-axis is labeled \"Freq\".", "The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines."], "params": [], "returns": ["Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes)."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(0)", ">>> array, mean, std, ax = f_737()", ">>> print(mean, std)", "250.7154 142.85617453522966", ">>> plt.show()"]}} -{"task_id": "f_364", "prompt": "import pandas as pd\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_364(num_rows=100, categories=[\"a\", \"b\", \"c\", \"d\", \"e\"], random_seed=42):\n \"\"\"\n Create a Pandas DataFrame with specified number of rows. 
Each row contains a randomly\n selected category from the provided categories list and a random integer between 1 and 100.\n\n The function also generates a bar chart visualizing the counts of each category in the DataFrame\n and returns both the DataFrame and the bar chart.\n\n Parameters:\n - num_rows (int): Number of rows in the DataFrame. Default is 100. Must be at least 1.\n - categories (list): List of categories to choose from. Default is ['a', 'b', 'c', 'd', 'e'].\n - random_seed (int): Seed for random number generation to ensure reproducibility. Default is 42.\n\n Returns:\n - pd.DataFrame: A pandas DataFrame with randomly generated category data.\n - plt.Axes: A bar chart visualizing the category counts.\n\n Requirements:\n - pandas\n - random\n\n Example:\n >>> df, ax = f_364(num_rows=5)\n >>> df\n Category Value\n 0 a 18\n 1 a 95\n 2 c 14\n 3 b 87\n 4 b 95\n \"\"\"", "canonical_solution": " if num_rows <= 0:\n raise ValueError(\"num_rows must not be negative\")\n\n random.seed(random_seed)\n\n df = pd.DataFrame(\n {\n \"Category\": [\n categories[random.randint(0, len(categories) - 1)]\n for _ in range(num_rows)\n ],\n \"Value\": [random.randint(1, 100) for _ in range(num_rows)],\n }\n )\n\n ax = (\n df[\"Category\"]\n .value_counts()\n .plot(kind=\"bar\", title=\"Category Counts\", figsize=(10, 6))\n )\n\n return df, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with default parameters\n df, ax = f_364()\n self.assertEqual(len(df), 100)\n self.assertTrue(\n set(df[\"Category\"].unique()).issubset(set([\"a\", \"b\", \"c\", \"d\", \"e\"]))\n )\n self.assertTrue(df[\"Value\"].min() >= 1)\n self.assertTrue(df[\"Value\"].max() <= 100)\n self.assertEqual(ax.get_title(), \"Category Counts\")\n def test_case_2(self):\n # Test num_rows\n for num_rows in [10, 50, 100]:\n df, _ = f_364(num_rows=num_rows)\n self.assertEqual(len(df), num_rows)\n def test_case_3(self):\n # Test 
edge case - 0 rows\n with self.assertRaises(Exception):\n f_364(num_rows=0)\n def test_case_4(self):\n # Test edge case - invalid num_rows\n with self.assertRaises(Exception):\n f_364(num_rows=-1)\n def test_case_5(self):\n # Test categories\n df, _ = f_364(categories=[\"x\", \"y\", \"z\"])\n self.assertTrue(set(df[\"Category\"].unique()).issubset(set([\"x\", \"y\", \"z\"])))\n def test_case_6(self):\n # Test edge case - single category\n df, _ = f_364(categories=[\"unique\"])\n self.assertTrue(\n set([\"unique\"]).issubset(df[\"Category\"].unique()),\n \"Should work with a single category\",\n )\n def test_case_7(self):\n # Test edge case - empty categories\n with self.assertRaises(Exception):\n f_364(categories=[])\n def test_case_8(self):\n # Test random seed\n df1, _ = f_364(random_seed=123)\n df2, _ = f_364(random_seed=123)\n df3, _ = f_364(random_seed=124)\n self.assertTrue(\n df1.equals(df2), \"DataFrames should be identical with the same seed\"\n )\n self.assertFalse(\n df1.equals(df3), \"DataFrames should differ with different seeds\"\n )\n def test_case_9(self):\n # Test visualization\n categories = [\"x\", \"y\", \"z\"]\n _, ax = f_364(num_rows=100, categories=categories, random_seed=42)\n ax_categories = [tick.get_text() for tick in ax.get_xticklabels()]\n self.assertListEqual(\n sorted(categories),\n sorted(ax_categories),\n \"X-axis categories should match input categories\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["random.randint", "pandas.DataFrame", "random.seed"], "libs": ["pandas", "random"], "doc": {"description": ["Create a Pandas DataFrame with specified number of rows. Each row contains a randomly", "selected category from the provided categories list and a random integer between 1 and 100.", "The function also generates a bar chart visualizing the counts of each category in the DataFrame", "and returns both the DataFrame and the bar chart."], "note": [], "params": ["num_rows (int): Number of rows in the DataFrame. 
Default is 100. Must be at least 1.", "categories (list): List of categories to choose from. Default is ['a', 'b', 'c', 'd', 'e'].", "random_seed (int): Seed for random number generation to ensure reproducibility. Default is 42."], "returns": ["pd.DataFrame: A pandas DataFrame with randomly generated category data.", "plt.Axes: A bar chart visualizing the category counts."], "reqs": ["pandas", "random"], "raises": [], "example": [">>> df, ax = f_364(num_rows=5)", ">>> df", "Category Value", "0 a 18", "1 a 95", "2 c 14", "3 b 87", "4 b 95"]}} -{"task_id": "f_896", "prompt": "import pandas as pd\nfrom sklearn.feature_extraction.text import CountVectorizer\nimport matplotlib.pyplot as plt\n\n# Constants\nSTOP_WORDS = [\"a\", \"an\", \"the\", \"in\", \"on\", \"at\", \"and\", \"or\"]\n\n\ndef f_896(file_path, save_path=None):\n \"\"\"\n This function processes a text dataset from a CSV file, performs text vectorization while excluding specific\n stopwords, and creates a histogram of the ten most common words. The function is robust to different input\n scenarios, such as empty data or data containing only stopwords.\n\n Parameters:\n - file_path (str): Path to the CSV file containing the text data. The CSV should have a single text column.\n - save_path (str, optional): Path where the histogram plot will be saved. If not provided, the plot is displayed.\n\n Returns:\n - matplotlib Axes object: If save_path is not provided and valid words are found in the input, the function\n displays the histogram plot and returns the matplotlib Axes object.\n - None: In two scenarios:\n 1. If save_path is provided, saves the plot to the specified location and returns None.\n 2. 
If the input file is empty or contains only stop words, prints a message and returns None.\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Examples:\n >>> ax = f_896('text_data.csv')\n # ax is the matplotlib Axes object for the plot\n >>> result = f_896('text_data.csv', 'output_plot.png')\n # result is None, and the plot is saved to 'output_plot.png'\n \"\"\"", "canonical_solution": " df = pd.read_csv(file_path, header=None, names=[\"Text\"])\n df[\"Text\"] = df[\"Text\"].str.split(\"\\\\n\").str.join(\" \")\n\n vectorizer = CountVectorizer(stop_words=STOP_WORDS)\n try:\n word_count = vectorizer.fit_transform(df[\"Text\"])\n except ValueError:\n # Handle the case where the DataFrame is empty or contains only stop words\n print(\"No valid words to plot. Returning None.\")\n return None\n\n sum_words = word_count.sum(axis=0)\n words_freq = [\n (word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()\n ]\n words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)\n\n top_words = words_freq[:10]\n df = pd.DataFrame(top_words, columns=[\"Word\", \"Count\"])\n\n ax = df.plot.bar(x=\"Word\", y=\"Count\", rot=0)\n\n # Saving or displaying the plot\n if save_path:\n plt.savefig(save_path)\n plt.close()\n return None\n else:\n return ax", "test": "import unittest\nfrom unittest.mock import patch\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_896\"\"\"\n @patch(\"pandas.read_csv\")\n def test_empty_csv(self, mock_read_csv):\n \"\"\"\n Test with an empty CSV file. Checks if the function handles empty data gracefully.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame(columns=[\"Text\"])\n result = f_896(\"dummy_path.csv\")\n self.assertIsNone(result, \"The function should return None for empty data\")\n @patch(\"pandas.read_csv\")\n def test_single_line_csv(self, mock_read_csv):\n \"\"\"\n Test with a CSV file containing a single line of text. 
Verifies correct handling of minimal data.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"test\"]})\n ax = f_896(\"dummy_path.csv\")\n self.assertEqual(\n len(ax.patches),\n 1,\n \"There should be one bar in the histogram for a single word\",\n )\n @patch(\"pandas.read_csv\")\n def test_stop_words_removal(self, mock_read_csv):\n \"\"\"\n Test to ensure that stop words are correctly removed from the text.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"a test\"]})\n ax = f_896(\"dummy_path.csv\")\n x_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertNotIn(\"a\", x_labels, \"Stop words should not appear in the histogram\")\n @patch(\"pandas.read_csv\")\n @patch(\"matplotlib.pyplot.savefig\")\n def test_save_plot(self, mock_savefig, mock_read_csv):\n \"\"\"\n Test the functionality of saving the plot to a file.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"save test\"]})\n f_896(\"dummy_path.csv\", \"output.png\")\n mock_savefig.assert_called_with(\"output.png\")\n @patch(\"pandas.read_csv\")\n def test_multiple_lines_csv(self, mock_read_csv):\n \"\"\"\n Test with a CSV file containing multiple lines of text. Checks for correct handling of multiline data.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"test1\", \"test2\"]})\n ax = f_896(\"dummy_path.csv\")\n self.assertEqual(\n len(ax.patches),\n 2,\n \"There should be two bars in the histogram for two different words\",\n )\n def tearDown(self):\n plt.close()", "apis": ["pandas.read_csv", "matplotlib.pyplot.close", "sklearn.feature_extraction.text.CountVectorizer", "matplotlib.pyplot.savefig", "pandas.DataFrame"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["This function processes a text dataset from a CSV file, performs text vectorization while excluding specific", "stopwords, and creates a histogram of the ten most common words. 
The function is robust to different input", "scenarios, such as empty data or data containing only stopwords."], "note": [], "params": ["file_path (str): Path to the CSV file containing the text data. The CSV should have a single text column.", "save_path (str, optional): Path where the histogram plot will be saved. If not provided, the plot is displayed."], "returns": ["matplotlib Axes object: If save_path is not provided and valid words are found in the input, the function", "displays the histogram plot and returns the matplotlib Axes object.", "None: In two scenarios:", "1. If save_path is provided, saves the plot to the specified location and returns None.", "2. If the input file is empty or contains only stop words, prints a message and returns None."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": [], "example": ["Examples:", ">>> ax = f_896('text_data.csv')", "# ax is the matplotlib Axes object for the plot", ">>> result = f_896('text_data.csv', 'output_plot.png')", "# result is None, and the plot is saved to 'output_plot.png'"]}} -{"task_id": "f_369", "prompt": "import matplotlib.pyplot as plt\nimport numpy as np\n\n\ndef f_369(myList):\n \"\"\"\n Draws a histogram of the values in a list and returns the plot's Axes.\n\n For visualization:\n - Bin edges are adjusted to align with integer values in `myList`.\n - Histogram bars are outlined in black.\n - X-axis label: 'Value'\n - Y-axis label: 'Frequency'\n - Plot title: 'Histogram of Values'\n\n Parameters:\n - myList (list): List of numerical values to plot.\n\n Returns:\n - ax (matplotlib.axes._axes.Axes): Axes object of the histogram plot.\n\n Requirements:\n - matplotlib.pyplot\n - numpy\n\n Example:\n >>> myList = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]\n >>> ax = f_369(myList)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0.0, 0, '0.0'), Text(0.5, 0, '0.5'), Text(1.0, 0, '1.0'), Text(1.5, 0, '1.5'), Text(2.0, 0, '2.0'), Text(2.5, 0, '2.5'), Text(3.0, 0, '3.0'), Text(3.5, 0, '3.5'), 
Text(4.0, 0, '4.0'), Text(4.5, 0, '4.5'), Text(5.0, 0, '5.0')]\n \"\"\"", "canonical_solution": " _, ax = plt.subplots()\n ax.hist(\n myList, bins=np.arange(min(myList), max(myList) + 2) - 0.5, edgecolor=\"black\"\n )\n ax.set_xlabel(\"Value\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(\"Histogram of Values\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n myList = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]\n ax = f_369(myList)\n heights, _, _ = ax.hist(\n myList,\n bins=np.arange(min(myList), max(myList) + 2) - 0.5,\n edgecolor=\"black\",\n )\n self.assertIsInstance(ax, plt.Axes)\n self.assertListEqual(list(heights), [1, 2, 3, 4])\n self.assertEqual(ax.get_title(), \"Histogram of Values\")\n self.assertEqual(ax.get_xlabel(), \"Value\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_2(self):\n # Test with empty list\n with self.assertRaises(ValueError):\n f_369([])\n def test_case_3(self):\n # Test with single element\n myList = [100]\n ax = f_369(myList)\n heights, _, _ = ax.hist(myList)\n self.assertEqual(heights.max(), 1)\n def test_case_4(self):\n # Test with negative values\n myList = [-5, -4, -3, -3, -2, -2, -2, -1]\n ax = f_369(myList)\n heights, _, _ = ax.hist(myList)\n self.assertGreaterEqual(len(heights), 1)\n def test_case_5(self):\n # Test with floats\n myList = [1.1, 1.2, 2.5, 2.5, 3.75, 4.25]\n ax = f_369(myList)\n heights, _, _ = ax.hist(myList)\n self.assertGreaterEqual(len(heights), 1)\n def test_case_6(self):\n # Test handling non-numeric values\n myList = [\"a\", \"b\", \"c\"]\n with self.assertRaises(TypeError):\n f_369(myList)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Draws a histogram of the values in a list and returns the plot's Axes.", "For visualization:", "- Bin edges are adjusted to 
align with integer values in `myList`.", "- Histogram bars are outlined in black.", "- X-axis label: 'Value'", "- Y-axis label: 'Frequency'", "- Plot title: 'Histogram of Values'"], "note": [], "params": ["myList (list): List of numerical values to plot."], "returns": ["ax (matplotlib.axes._axes.Axes): Axes object of the histogram plot."], "reqs": ["matplotlib.pyplot", "numpy"], "raises": [], "example": [">>> myList = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]", ">>> ax = f_369(myList)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0.0, 0, '0.0'), Text(0.5, 0, '0.5'), Text(1.0, 0, '1.0'), Text(1.5, 0, '1.5'), Text(2.0, 0, '2.0'), Text(2.5, 0, '2.5'), Text(3.0, 0, '3.0'), Text(3.5, 0, '3.5'), Text(4.0, 0, '4.0'), Text(4.5, 0, '4.5'), Text(5.0, 0, '5.0')]"]}} -{"task_id": "f_887", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Constants\nCATEGORIES = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n\n\ndef f_887(data_list):\n \"\"\"\n Processes a list of category labels to create a histogram that visualizes their distribution.\n This histogram compares the distribution of a predefined set of categories (A, B, C, D, E)\n with any additional categories found in the input list.\n\n Parameters:\n - data_list (list): A list containing category labels (strings).\n\n Returns:\n - Axes object (matplotlib.axes._subplots.AxesSubplot): The histogram displaying the distribution of categories.\n\n Requirements:\n - pandas\n - matplotlib\n\n Notes:\n - The function evaluates the distribution of predefined categories ('A', 'B', 'C', 'D', 'E') and checks for uniformity.\n - Categories in the data_list that are not among the predefined categories are identified and included in the histogram.\n - The ax.bar call in the function creates a bar plot on the axes object. 
It uses the following parameters:\n * all_categories: The categories to be displayed on the x-axis, including both predefined and extra categories.\n * category_counts.reindex(all_categories, fill_value=0): The counts of each category, where categories not found\n in the data_list are assigned a count of 0.\n * width=0.8: Sets the width of the bars in the bar plot.\n * align=\"center\": Aligns the bars with the center of the x-ticks.\n\n Raises:\n - ValueError: If the input data_list is empty, the function raises a ValueError with the message \"The data list is empty.\"\n In this case, no histogram is generated and the function terminates.\n\n\n Example:\n >>> data = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n >>> ax = f_887(data)\n >>> ax.get_xticks()\n array([0., 1., 2., 3., 4., 5., 6.])\n \"\"\"", "canonical_solution": "\n if not data_list:\n raise ValueError(\"The data list is empty.\")\n\n data_series = pd.Series(data_list)\n category_counts = data_series.value_counts()\n\n # Prepare data for predefined categories\n predefined_counts = category_counts.reindex(CATEGORIES, fill_value=0)\n\n # Check for uniformity in predefined categories\n if not all(x == predefined_counts.iloc[0] for x in predefined_counts):\n print(\"The distribution of predefined categories is not uniform.\")\n\n # Handling extra categories not in predefined list\n extra_categories = category_counts.drop(CATEGORIES, errors=\"ignore\").index.tolist()\n all_categories = CATEGORIES + extra_categories\n\n _, ax = plt.subplots()\n ax.bar(\n all_categories,\n category_counts.reindex(all_categories, fill_value=0),\n width=0.8,\n align=\"center\",\n )\n ax.set_xticks(all_categories)\n\n return ax", "test": "import unittest\nfrom unittest.mock import patch\nimport io\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function.\"\"\"\n def test_empty_list(self):\n \"\"\"\n Test the function with an empty list. 
Expects ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_887([])\n def test_uniform_distribution(self):\n \"\"\"\n Test the function with a uniform distribution of predefined categories.\n Expects no printed warning about non-uniform distribution.\n \"\"\"\n data = [\"A\", \"B\", \"C\", \"D\", \"E\"] * 2\n with patch(\"sys.stdout\", new=io.StringIO()) as fake_output:\n f_887(data)\n self.assertNotIn(\n \"The distribution of predefined categories is not uniform.\",\n fake_output.getvalue(),\n )\n def test_non_uniform_distribution(self):\n \"\"\"\n Test the function with a non-uniform distribution of predefined categories.\n Expects a printed warning about non-uniform distribution.\n \"\"\"\n data = [\"A\", \"A\", \"B\", \"C\", \"D\", \"E\"]\n with patch(\"sys.stdout\", new=io.StringIO()) as fake_output:\n f_887(data)\n self.assertIn(\n \"The distribution of predefined categories is not uniform.\",\n fake_output.getvalue(),\n )\n def test_extra_categories(self):\n \"\"\"\n Test the function with extra categories not in the predefined list.\n Expects extra categories to be included in the histogram.\n \"\"\"\n data = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\", \"G\"]\n ax = f_887(data)\n self.assertIn(\"F\", [tick.get_text() for tick in ax.get_xticklabels()])\n self.assertIn(\"G\", [tick.get_text() for tick in ax.get_xticklabels()])\n def test_no_extra_categories(self):\n \"\"\"\n Test the function with no extra categories.\n Expects only predefined categories to be included in the histogram.\n \"\"\"\n data = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n ax = f_887(data)\n for extra_cat in [\"F\", \"G\"]:\n self.assertNotIn(\n extra_cat, [tick.get_text() for tick in ax.get_xticklabels()]\n )\n def tearDown(self):\n plt.clf()", "apis": ["pandas.Series", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas"], "doc": {"description": ["Processes a list of category labels to create a histogram that visualizes their distribution.", "This histogram compares 
the distribution of a predefined set of categories (A, B, C, D, E)", "with any additional categories found in the input list.", "Notes:", "- The function evaluates the distribution of predefined categories ('A', 'B', 'C', 'D', 'E') and checks for uniformity.", "- Categories in the data_list that are not among the predefined categories are identified and included in the histogram.", "- The ax.bar call in the function creates a bar plot on the axes object. It uses the following parameters:", "* all_categories: The categories to be displayed on the x-axis, including both predefined and extra categories.", "* category_counts.reindex(all_categories, fill_value=0): The counts of each category, where categories not found", "in the data_list are assigned a count of 0.", "* width=0.8: Sets the width of the bars in the bar plot.", "* align=\"center\": Aligns the bars with the center of the x-ticks."], "note": [], "params": ["data_list (list): A list containing category labels (strings)."], "returns": ["Axes object (matplotlib.axes._subplots.AxesSubplot): The histogram displaying the distribution of categories."], "reqs": ["pandas", "matplotlib"], "raises": ["ValueError: If the input data_list is empty, the function raises a ValueError with the message \"The data list is empty.\"", "In this case, no histogram is generated and the function terminates."], "example": [">>> data = ['A', 'B', 'C', 'D', 'E', 'F', 'G']", ">>> ax = f_887(data)", ">>> ax.get_xticks()", "array([0., 1., 2., 3., 4., 5., 6.])"]}} -{"task_id": "f_789", "prompt": "import numpy as np\nimport pandas as pd\nimport random\n\ndef f_789(rows=3, cols=2, min_val=0, max_val=100, seed=0):\n \"\"\"\n Creates a matrix of specified dimensions with random integers within a given range,\n and then converts it into a pandas DataFrame.\n \n Parameters:\n - rows (int): Number of rows in the matrix. Default is 3.\n - cols (int): Number of columns in the matrix. 
Default is 2.\n - min_val (int): Minimum integer value for the random integers. Default is 0.\n - max_val (int): Maximum integer value for the random integers. Default is 100.\n \n Returns:\n DataFrame: A pandas DataFrame containing random integers within the specified range.\n \n Requirements:\n - numpy\n - pandas\n - random\n\n Example:\n >>> df = f_789(3, 2, 0, 100)\n >>> print(type(df))\n \n >>> print(df.shape)\n (3, 2)\n \"\"\"", "canonical_solution": " random.seed(seed)\n if min_val == max_val:\n matrix = np.full((rows, cols), min_val)\n else:\n matrix = np.array([[random.randrange(min_val, max_val) for j in range(cols)] for i in range(rows)])\n df = pd.DataFrame(matrix)\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = f_789()\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.iloc[:, 0].tolist(), [49, 53, 33])\n self.assertEqual(df.iloc[:, 1].tolist(), [97, 5, 65])\n \n def test_case_2(self):\n df = f_789(rows=5, cols=4)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.iloc[:, 0].tolist(), [49, 33, 38, 27, 17])\n self.assertEqual(df.iloc[:, 1].tolist(), [97, 65, 61, 64, 96])\n self.assertEqual(df.iloc[:, 2].tolist(), [53, 62, 45, 17, 12])\n def test_case_3(self):\n df = f_789(min_val=10, max_val=20)\n self.assertEqual(df.iloc[:, 0].tolist(), [16, 10, 18])\n self.assertEqual(df.iloc[:, 1].tolist(), [16, 14, 17])\n \n def test_case_4(self):\n df = f_789(min_val=50, max_val=50)\n self.assertEqual(df.iloc[:, 0].tolist(), [50, 50, 50])\n self.assertEqual(df.iloc[:, 1].tolist(), [50, 50, 50])\n def test_case_5(self):\n df = f_789(rows=0, cols=2)\n self.assertTrue(df.empty)", "apis": ["random.seed", "numpy.full", "random.randrange", "numpy.array", "pandas.DataFrame"], "libs": ["numpy", "pandas", "random"], "doc": {"description": ["Creates a matrix of specified dimensions with random integers within a given range,", "and then converts it into a pandas 
DataFrame."], "note": [], "params": ["rows (int): Number of rows in the matrix. Default is 3.", "cols (int): Number of columns in the matrix. Default is 2.", "min_val (int): Minimum integer value for the random integers. Default is 0.", "max_val (int): Maximum integer value for the random integers. Default is 100."], "returns": ["DataFrame: A pandas DataFrame containing random integers within the specified range."], "reqs": ["numpy", "pandas", "random"], "raises": [], "example": [">>> df = f_789(3, 2, 0, 100)", ">>> print(type(df))", "", ">>> print(df.shape)", "(3, 2)"]}} -{"task_id": "f_533", "prompt": "import csv\nimport sys\n\ndef f_533(filename):\n \"\"\"\n Read a CSV file, inverse the order of the lines and write the inverted lines back into the file. Then reset the cursor to the beginning of the file.\n\n Parameters:\n - filename (str): The name of the CSV file.\n\n Returns:\n - filename (str): The name of the CSV file.\n\n Requirements:\n - csv\n - sys\n\n Example:\n >>> f_533('file.csv')\n 'file.csv'\n \"\"\"", "canonical_solution": " try:\n with open(filename, 'r+') as file:\n reader = csv.reader(file)\n rows = list(reader)\n file.seek(0)\n file.truncate()\n\n writer = csv.writer(file)\n writer.writerows(reversed(rows))\n\n file.seek(0)\n except Exception as e:\n print(f\"An error occurred: {e}\", file=sys.stderr)\n\n return filename", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def base(self, filename, contents, expected):\n # Create file\n with open(filename, 'w') as file:\n file.write(contents)\n # Run function\n f_533(filename)\n # Check file\n with open(filename, 'r') as file:\n txt = file.read()\n self.assertEqual(txt, expected)\n # Remove file\n os.remove(filename)\n def test_case_1(self):\n self.base('file.csv', \"a,b\\nc,d\\ne,f\\ng,h\\n\", \"g,h\\ne,f\\nc,d\\na,b\\n\")\n \n def test_case_2(self):\n self.base('file.csv', \"a,b,c\\nd,e,f\\ng,h,i\\n\", \"g,h,i\\nd,e,f\\na,b,c\\n\")\n def test_case_3(self):\n 
self.base('file.csv', \"a,b,c,d\\ne,f,g,h\\ni,j,k,l\\n\", \"i,j,k,l\\ne,f,g,h\\na,b,c,d\\n\")\n \n def test_case_4(self):\n self.base('file.csv', \"a,b,c,d,e\\nf,g,h,i,j\\nk,l,m,n,o\\n\", \"k,l,m,n,o\\nf,g,h,i,j\\na,b,c,d,e\\n\")\n def test_case_5(self):\n self.base('file.csv', \"a,b,c,d,e,f\\ng,h,i,j,k,l\\nm,n,o,p,q,r\\n\", \"m,n,o,p,q,r\\ng,h,i,j,k,l\\na,b,c,d,e,f\\n\")", "apis": ["sys.stderr", "csv.writer", "csv.reader"], "libs": ["csv", "sys"], "doc": {"description": ["Read a CSV file, inverse the order of the lines and write the inverted lines back into the file. Then reset the cursor to the beginning of the file."], "note": [], "params": ["filename (str): The name of the CSV file."], "returns": ["filename (str): The name of the CSV file."], "reqs": ["csv", "sys"], "raises": [], "example": [">>> f_533('file.csv')", "'file.csv'"]}} -{"task_id": "f_908", "prompt": "import numpy as np\nimport seaborn as sns\n\n\ndef f_908(arr):\n \"\"\"\n Plots a heatmap of a given 2D numerical array and prints the sum of each row.\n The heatmap's color range is set based on the minimum and maximum values in the array.\n\n Parameters:\n arr (numpy.array): A 2D numpy array of numerical values.\n\n Returns:\n ax (matplotlib.axes.Axes): The Axes object with the plotted heatmap.\n\n Requirements:\n - numpy\n - seaborn\n\n Note:\n The function calculates the sum of each row and prints these values.\n The heatmap is plotted based on the original array with its color range set from the minimum to the maximum value in the array.\n\n Example:\n >>> arr = np.array([[i + j for i in range(3)] for j in range(5)])\n >>> ax = f_908(arr)\n >>> ax.get_title()\n 'Heatmap of the 2D Array'\n \"\"\"", "canonical_solution": " row_sums = arr.sum(axis=1)\n vmax = np.max(arr) # Set vmax to the maximum value in the array\n vmin = np.min(arr) # Set vmin to the minimum value in the array\n ax = sns.heatmap(\n arr, annot=True, vmax=vmax, vmin=vmin\n ) # Include both vmin and vmax in the heatmap call\n 
ax.set_title(\"Heatmap of the 2D Array\")\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_908.\"\"\"\n def tearDown(self):\n plt.clf()\n def test_scenario_1(self):\n \"\"\"Scenario 1: Testing with a 2D array created by adding row and column indices.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertEqual(ax.collections[0].colorbar.vmax, expected_vmax)\n def test_scenario_2(self):\n \"\"\"Scenario 2: Testing with a 2D array where each column has identical values based on the column index.\"\"\"\n arr = np.array([[i for i in range(3)] for j in range(5)])\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertEqual(ax.collections[0].colorbar.vmax, expected_vmax)\n def test_scenario_3(self):\n \"\"\"Scenario 3: Testing with a 2D array where each row has identical values based on the row index.\"\"\"\n arr = np.array([[j for i in range(3)] for j in range(5)])\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertEqual(ax.collections[0].colorbar.vmax, expected_vmax)\n def test_scenario_4(self):\n \"\"\"Scenario 4: Testing with a 2D array of zeros.\"\"\"\n arr = np.zeros((5, 3))\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertAlmostEqual(\n ax.collections[0].colorbar.vmax, expected_vmax, delta=0.2\n )\n def test_scenario_5(self):\n \"\"\"Scenario 5: Testing with a 2D array of ones.\"\"\"\n arr = np.ones((5, 3))\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = 
f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertAlmostEqual(\n ax.collections[0].colorbar.vmax, expected_vmax, delta=0.2\n )", "apis": ["seaborn.heatmap", "numpy.min", "numpy.max"], "libs": ["numpy", "seaborn"], "doc": {"description": ["Plots a heatmap of a given 2D numerical array and prints the sum of each row.", "The heatmap's color range is set based on the minimum and maximum values in the array."], "note": ["The function calculates the sum of each row and prints these values.", "The heatmap is plotted based on the original array with its color range set from the minimum to the maximum value in the array."], "params": ["arr (numpy.array): A 2D numpy array of numerical values."], "returns": ["ax (matplotlib.axes.Axes): The Axes object with the plotted heatmap."], "reqs": ["numpy", "seaborn"], "raises": [], "example": [">>> arr = np.array([[i + j for i in range(3)] for j in range(5)])", ">>> ax = f_908(arr)", ">>> ax.get_title()", "'Heatmap of the 2D Array'"]}} -{"task_id": "f_792", "prompt": "import numpy as np\nimport pandas as pd\n\ndef f_792(rows, columns, seed=None):\n \"\"\"\n Generate a DataFrame with random values within a specified range.\n \n This function creates a matrix of given dimensions filled with random values between 0 and 1 and returns it as a Pandas DataFrame. Users have the option to set a random seed for reproducible results.\n \n Parameters:\n - rows (int): The number of rows for the matrix.\n - columns (int): The number of columns for the matrix.\n - seed (int, optional): The seed for the random number generator. 
Default is None.\n \n Returns:\n - DataFrame: A Pandas DataFrame containing the generated random values.\n \n Examples:\n >>> df = f_792(3, 2, seed=42)\n >>> print(df.shape)\n (3, 2)\n \n >>> df = f_792(1, 1, seed=24)\n >>> print(df.shape)\n (1, 1)\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n matrix = np.random.rand(rows, columns)\n df = pd.DataFrame(matrix)\n \n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def setUp(self):\n self.seed = 42\n def test_case_1(self):\n df = f_792(3, 2, seed=self.seed)\n self.assertEqual(df.shape, (3, 2))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_2(self):\n df = f_792(5, 5, seed=self.seed)\n self.assertEqual(df.shape, (5, 5))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_3(self):\n df = f_792(1, 1, seed=self.seed)\n self.assertEqual(df.shape, (1, 1))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_4(self):\n df = f_792(4, 3, seed=self.seed)\n self.assertEqual(df.shape, (4, 3))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_5(self):\n df = f_792(2, 2, seed=self.seed)\n self.assertEqual(df.shape, (2, 2))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())", "apis": ["numpy.random", "numpy.random.rand", "pandas.DataFrame", "numpy.random.seed"], "libs": ["numpy", "pandas"], "doc": {"description": ["Generate a DataFrame with random values within a specified range.", "This function creates a matrix of given dimensions filled with random values between 0 and 1 and returns it as a Pandas DataFrame. 
Users have the option to set a random seed for reproducible results.", ">>> df = f_792(1, 1, seed=24)", ">>> print(df.shape)", "(1, 1)"], "note": [], "params": ["rows (int): The number of rows for the matrix.", "columns (int): The number of columns for the matrix.", "seed (int, optional): The seed for the random number generator. Default is None."], "returns": ["DataFrame: A Pandas DataFrame containing the generated random values."], "reqs": [], "raises": [], "example": ["Examples:", ">>> df = f_792(3, 2, seed=42)", ">>> print(df.shape)", "(3, 2)"]}} -{"task_id": "f_540", "prompt": "import pandas as pd\nfrom collections import Counter\n\ndef f_540(df):\n \"\"\"\n Calculate the frequency of combinations of elements in a DataFrame.\n The function adds a 'combination' column to the DataFrame, which is the combination of items in each row.\n It then calculates the frequency of each combination.\n \n Parameters:\n - df (pandas.DataFrame): The input DataFrame with columns 'item1', 'item2', 'item3', 'item4', 'item5'.\n \n Returns:\n - dict: A dictionary containing the frequency of all combination.\n\n Requirements:\n - pandas\n - collections\n\n Example:\n >>> df = pd.DataFrame({'item1': ['a', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})\n >>> f_540(df)\n {('a', 'b', 'c', 'd', 'e'): 2, ('b', 'c', 'd', 'e', 'f'): 1}\n \"\"\"", "canonical_solution": " df['combination'] = pd.Series(df.apply(lambda row: tuple(sorted(row)), axis=1))\n \n # Using Counter from collections to calculate the frequency of each combination\n combination_freq = Counter(df['combination'])\n \n return dict(combination_freq)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'item1': ['a', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 
2)\n self.assertEqual(freq[('b', 'c', 'd', 'e', 'f')], 1)\n def test_case_2(self):\n df = pd.DataFrame({'item1': ['c', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})\n freq = f_540(df)\n print(freq)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 1)\n self.assertEqual(freq[('b', 'c', 'd', 'e', 'f')], 1)\n if ('b', 'c', 'c', 'd', 'e') in freq:\n self.assertEqual(freq[('b', 'c', 'c', 'd', 'e')], 1)\n elif ('c', 'b', 'c', 'd', 'e') in freq:\n self.assertEqual(freq[('c', 'b', 'c', 'd', 'e')], 1)\n def test_case_3(self):\n df = pd.DataFrame({'item1': ['a'], 'item2': ['a'], 'item3': ['a'], 'item4': ['a'], 'item5': ['a']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'a', 'a', 'a', 'a')], 1)\n def test_case_4(self):\n df = pd.DataFrame({'item1': ['a', 'b', 'c'], 'item2': ['b', 'c', 'd'], 'item3': ['c', 'd', 'e'], 'item4': ['d', 'e', 'f'], 'item5': ['e', 'f', 'g']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 1)\n self.assertEqual(freq[('b', 'c', 'd', 'e', 'f')], 1)\n self.assertEqual(freq[('c', 'd', 'e', 'f', 'g')], 1)\n def test_case_5(self):\n df = pd.DataFrame({'item1': ['a', 'a', 'a'], 'item2': ['b', 'b', 'b'], 'item3': ['c', 'c', 'c'], 'item4': ['d', 'd', 'd'], 'item5': ['e', 'e', 'e']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 3)", "apis": ["pandas.Series", "collections.Counter"], "libs": ["pandas", "collections"], "doc": {"description": ["Calculate the frequency of combinations of elements in a DataFrame.", "The function adds a 'combination' column to the DataFrame, which is the combination of items in each row.", "It then calculates the frequency of each combination."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame with columns 'item1', 'item2', 'item3', 'item4', 'item5'."], "returns": ["dict: A dictionary containing the frequency of all combination."], "reqs": ["pandas", "collections"], "raises": [], "example": 
[">>> df = pd.DataFrame({'item1': ['a', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})", ">>> f_540(df)", "{('a', 'b', 'c', 'd', 'e'): 2, ('b', 'c', 'd', 'e', 'f'): 1}"]}} -{"task_id": "f_905", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\n\ndef f_905(arr: np.ndarray) -> (plt.Axes, np.ndarray):\n \"\"\"\n Plots a histogram of normalized data from an input 2D numpy array alongside the probability density function (PDF)\n of a standard normal distribution.\n\n Note:\n - Takes in a 2D numpy array as input.\n - Calculates the sum of elements in each row of the array.\n - Normalizes these row sums to have a mean of 0 and a standard deviation of 1.\n - Normalization is achieved by first calculating the mean and standard deviation of the row sums.\n - Each row sum is then transformed by subtracting the mean and dividing by the standard deviation.\n - If the standard deviation is 0 (indicating all row sums are equal), normalization results in an array of zeros with the same shape.\n - Plots a histogram of the normalized data.\n - Uses 30 bins for the histogram.\n - The histogram is density-based, meaning it represents the probability density rather than raw frequencies.\n - The bars of the histogram are semi-transparent (60% opacity) and green in color.\n - Overlays the PDF of a standard normal distribution on the histogram for comparison.\n - The PDF curve is plotted in red with a line width of 2.\n - The range of the PDF curve is set to cover 99% of a standard normal distribution.\n - Sets the title of the plot to \"Histogram of Normalized Data with Standard Normal PDF\".\n\n Parameters:\n - arr: A 2D numpy array. 
The array should contain numerical data.\n\n Returns:\n - A tuple containing:\n - A matplotlib Axes object with the histogram of the normalized data and the overlaid standard normal PDF.\n - The normalized data as a 1D numpy array.\n\n Requirements:\n - numpy\n - scipy\n - matplotlib\n\n Example:\n >>> ax, normalized_data = f_905(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))\n >>> type(ax)\n \n >>> print(normalized_data)\n [-1.22474487 0. 1.22474487]\n \"\"\"", "canonical_solution": " # Calculating row sums\n row_sums = arr.sum(axis=1)\n\n # Normalizing the data\n mean = np.mean(row_sums)\n std_dev = np.std(row_sums)\n normalized_data = (\n (row_sums - mean) / std_dev if std_dev != 0 else np.zeros_like(row_sums)\n )\n\n # Plotting the histogram\n _, ax = plt.subplots()\n ax.hist(normalized_data, bins=30, density=True, alpha=0.6, color=\"g\")\n\n # Plotting the PDF of a standard normal distribution\n x = np.linspace(norm.ppf(0.01), norm.ppf(0.99), 100)\n ax.plot(x, norm.pdf(x), \"r-\", lw=2)\n ax.set_title(\"Histogram of Normalized Data with Standard Normal PDF\")\n\n return ax, normalized_data", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for `f_905`.\"\"\"\n def test_histogram_and_pdf(self):\n \"\"\"Test that the histogram and PDF are plotted.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n ax, _ = f_905(arr)\n self.assertEqual(\n ax.get_title(),\n \"Histogram of Normalized Data with Standard Normal PDF\",\n )\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(len(ax.patches), 30)\n def test_normalized_data(self):\n \"\"\"Test that the normalized data is correct.\"\"\"\n arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n _, normalized_data = f_905(arr)\n expected_data = [-1.22474487, 0.0, 1.22474487]\n for i in range(len(expected_data)):\n self.assertTrue(np.isclose(normalized_data[i], expected_data[i]))\n def test_empty_array(self):\n \"\"\"Test empty array.\"\"\"\n arr = 
np.array([[], [], []])\n _, normalized_data = f_905(arr)\n for value in normalized_data:\n self.assertTrue(np.isclose(value, 0))\n def test_single_value_array(self):\n \"\"\"Test single value array.\"\"\"\n arr = np.array([[5], [5], [5]])\n _, normalized_data = f_905(arr)\n for value in normalized_data:\n self.assertTrue(np.isclose(value, 0))\n def test_large_values(self):\n \"\"\"Test large values.\"\"\"\n arr = np.array([[1e6, 2e6, 3e6], [4e6, 5e6, 6e6], [7e6, 8e6, 9e6]])\n _, normalized_data = f_905(arr)\n expected_data = [-1.22474487, 0.0, 1.22474487]\n for i in range(len(expected_data)):\n self.assertTrue(np.isclose(normalized_data[i], expected_data[i]))", "apis": ["scipy.stats.norm.pdf", "numpy.ndarray", "scipy.stats.norm.ppf", "numpy.std", "numpy.mean", "matplotlib.pyplot.Axes", "numpy.linspace", "matplotlib.pyplot.subplots", "numpy.zeros_like"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Plots a histogram of normalized data from an input 2D numpy array alongside the probability density function (PDF)", "of a standard normal distribution."], "note": ["Takes in a 2D numpy array as input.", "Calculates the sum of elements in each row of the array.", "Normalizes these row sums to have a mean of 0 and a standard deviation of 1.", "Normalization is achieved by first calculating the mean and standard deviation of the row sums.", "Each row sum is then transformed by subtracting the mean and dividing by the standard deviation.", "If the standard deviation is 0 (indicating all row sums are equal), normalization results in an array of zeros with the same shape.", "Plots a histogram of the normalized data.", "Uses 30 bins for the histogram.", "The histogram is density-based, meaning it represents the probability density rather than raw frequencies.", "The bars of the histogram are semi-transparent (60% opacity) and green in color.", "Overlays the PDF of a standard normal distribution on the histogram for comparison.", "The PDF curve is plotted 
in red with a line width of 2.", "The range of the PDF curve is set to cover 99% of a standard normal distribution.", "Sets the title of the plot to \"Histogram of Normalized Data with Standard Normal PDF\"."], "params": ["arr: A 2D numpy array. The array should contain numerical data."], "returns": ["A tuple containing:", "A matplotlib Axes object with the histogram of the normalized data and the overlaid standard normal PDF.", "The normalized data as a 1D numpy array."], "reqs": ["numpy", "scipy", "matplotlib"], "raises": [], "example": [">>> ax, normalized_data = f_905(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))", ">>> type(ax)", "", ">>> print(normalized_data)", "[-1.22474487 0. 1.22474487]"]}} -{"task_id": "f_345", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_345(P, T):\n \"\"\"\n Calculate the product of matrix \"P\" and 3D tensor \"T\" then return dataframe of normalized results.\n\n This function performs matrix-tensor multiplication between a matrix \"P\" and a 3D tensor \"T\" using numpy.\n It checks if the shapes of P and T are compatible for multiplication, raising a ValueError if they are not.\n The function then normalizes the resulting 2D array using sklearn's StandardScaler. The final output\n is returned as a pandas DataFrame, with columns named feature_0, feature_1, ..., feature_n,\n where n is the number of features in the flattened result of the matrix-tensor multiplication.\n\n Parameters:\n - P (numpy.ndarray): The input matrix. Must not be empty.\n - T (numpy.ndarray): The input tensor. 
Must not be empty.\n\n Returns:\n pandas.DataFrame: A DataFrame with the normalized result.\n\n Requirements:\n - numpy\n - pandas\n - sklearn.preprocessing\n\n Example:\n >>> np.random.seed(0)\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])\n >>> T = np.random.rand(3, 5, 5)\n >>> result = f_345(P, T)\n >>> type(result)\n \n >>> result.head(2)\n feature_0 feature_1 feature_2 ... feature_22 feature_23 feature_24\n 0 0.214791 0.220904 1.697850 ... 1.768847 -1.759510 -0.003527\n 1 -0.652336 1.064228 -0.707134 ... -0.036116 1.002544 -0.813796\n \n [2 rows x 25 columns]\n \"\"\"", "canonical_solution": " if P.size == 0 or T.size == 0:\n raise ValueError(\"Inputs cannot be empty.\")\n if P.shape[1] != T.shape[0]:\n raise ValueError(\n f\"Matrix P shape {P.shape[1]} and Tensor T shape {T.shape[0]} are incompatible for tensor multiplication.\"\n )\n\n result = np.tensordot(P, T, axes=[1, 0]).swapaxes(0, 1)\n result = result.reshape(result.shape[0], -1)\n\n scaler = StandardScaler()\n result = scaler.fit_transform(result)\n\n adjusted_feature_names = [f\"feature_{i}\" for i in range(result.shape[1])]\n result = pd.DataFrame(result, columns=adjusted_feature_names)\n\n return result", "test": "import unittest\nimport numpy as np\nfrom sklearn.preprocessing import StandardScaler\nclass TestCases(unittest.TestCase):\n def tensor_product_manual(self, P, T):\n \"\"\"Manually compute the tensor product without any normalization.\"\"\"\n result = np.tensordot(P, T, axes=[1, 0]).swapaxes(0, 1)\n result = result.reshape(result.shape[0], -1)\n return result\n def test_case_1(self):\n np.random.seed(0)\n P = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n T = np.random.rand(3, 4, 4)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (4, 12))\n 
self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_2(self):\n np.random.seed(0)\n P = np.array([[1, 2], [3, 4], [5, 6]])\n T = np.random.rand(3, 5, 5)\n with self.assertRaises(ValueError):\n f_345(P, T)\n def test_case_3(self):\n np.random.seed(0)\n P = np.eye(4)\n T = np.random.rand(4, 6, 6)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (6, 24))\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_4(self):\n np.random.seed(0)\n P = np.ones((5, 5))\n T = np.random.rand(5, 7, 7)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (7, 35))\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_5(self):\n np.random.seed(0)\n P = np.diag(np.arange(1, 7))\n T = np.random.rand(6, 8, 8)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (8, 48))\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_6(self):\n # Test with an empty matrix and tensor, expecting a ValueError due to incompatible shapes\n P = np.array([])\n T = np.array([])\n with self.assertRaises(ValueError):\n f_345(P, T)\n 
def test_case_7(self):\n # Test with non-numeric inputs in matrices/tensors to verify type handling\n P = np.array([[\"a\", \"b\"], [\"c\", \"d\"]])\n T = np.random.rand(2, 2, 2)\n with self.assertRaises(Exception):\n f_345(P, T)\n def test_case_8(self):\n # Test with zero matrix and tensor to verify handling of all-zero inputs\n P = np.zeros((5, 5))\n T = np.zeros((5, 3, 3))\n result = f_345(P, T)\n self.assertTrue(np.allclose(result, np.zeros((3, 15))))\n def test_case_9(self):\n # Test DataFrame output for correct column names, ensuring they match expected feature naming convention\n P = np.random.rand(3, 3)\n T = np.random.rand(3, 4, 4)\n result = f_345(P, T)\n expected_columns = [\n \"feature_0\",\n \"feature_1\",\n \"feature_2\",\n \"feature_3\",\n \"feature_4\",\n \"feature_5\",\n \"feature_6\",\n \"feature_7\",\n \"feature_8\",\n \"feature_9\",\n \"feature_10\",\n \"feature_11\",\n ]\n self.assertListEqual(list(result.columns), expected_columns)\n def test_case_10(self):\n # Test to ensure DataFrame indices start from 0 and are sequential integers\n P = np.random.rand(2, 3)\n T = np.random.rand(3, 5, 5)\n result = f_345(P, T)\n expected_indices = list(range(5)) # Expected indices for 5 rows\n self.assertListEqual(list(result.index), expected_indices)", "apis": ["pandas.DataFrame", "numpy.tensordot", "sklearn.preprocessing.StandardScaler"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Calculate the product of matrix \"P\" and 3D tensor \"T\" then return dataframe of normalized results.", "This function performs matrix-tensor multiplication between a matrix \"P\" and a 3D tensor \"T\" using numpy.", "It checks if the shapes of P and T are compatible for multiplication, raising a ValueError if they are not.", "The function then normalizes the resulting 2D array using sklearn's StandardScaler. 
The final output", "is returned as a pandas DataFrame, with columns named feature_0, feature_1, ..., feature_n,", "where n is the number of features in the flattened result of the matrix-tensor multiplication."], "note": [], "params": ["P (numpy.ndarray): The input matrix. Must not be empty.", "T (numpy.ndarray): The input tensor. Must not be empty."], "returns": ["pandas.DataFrame: A DataFrame with the normalized result."], "reqs": ["numpy", "pandas", "sklearn.preprocessing"], "raises": [], "example": [">>> np.random.seed(0)", ">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])", ">>> T = np.random.rand(3, 5, 5)", ">>> result = f_345(P, T)", ">>> type(result)", "", ">>> result.head(2)", "feature_0 feature_1 feature_2 ... feature_22 feature_23 feature_24", "0 0.214791 0.220904 1.697850 ... 1.768847 -1.759510 -0.003527", "1 -0.652336 1.064228 -0.707134 ... -0.036116 1.002544 -0.813796", "", "[2 rows x 25 columns]"]}} -{"task_id": "f_850", "prompt": "import requests\nimport pandas as pd\n\n\ndef f_850(url: str) -> pd.DataFrame:\n \"\"\"\n This function fetches JSON data from a specified URL and converts it into a Pandas DataFrame.\n It expects the JSON to be in a format that is directly convertible to a DataFrame, typically\n a list of dictionaries. The function handles various scenarios including successful data\n retrieval and conversion, network issues, and invalid JSON format.\n\n Parameters:\n - url (str): The URL where the JSON file is located.\n\n Returns:\n - pd.DataFrame: A DataFrame constructed from the JSON data fetched from the URL.\n\n Raises:\n - SystemError: If there is a network-related issue such as a connection error, timeout,\n or if the server responded with an unsuccessful status code (like 404 or 500). This is a\n re-raised exception from requests.RequestException to provide a more specific error message.\n - ValueError: If the fetched data is not in a valid JSON format that can be converted into\n a DataFrame. 
This could occur if the data structure does not match the expected format (e.g.,\n not a list of dictionaries).\n\n Requirements:\n - requests\n - pandas\n\n Example:\n >>> f_850('https://example.com/data.json')\n DataFrame:\n A B\n\n Notes:\n - The function uses a timeout of 5 seconds for the network request to avoid hanging indefinitely.\n - It checks the HTTP response status and raises an HTTPError for unsuccessful status codes.\n - Directly converts the HTTP response to JSON and then to a DataFrame, without intermediate processing.\n \"\"\"", "canonical_solution": " try:\n response = requests.get(url, timeout=5)\n response.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code\n data = response.json() # Directly converts the response content to JSON\n df = pd.DataFrame(data)\n return df\n except requests.RequestException as e:\n raise SystemError(f\"Network error occurred: {e}\") from e\n except ValueError as exc:\n raise ValueError(\"Invalid JSON format for DataFrame conversion\") from exc", "test": "import unittest\nimport requests\nimport pandas as pd\nfrom unittest.mock import patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_850.\"\"\"\n @patch(\"requests.get\")\n def test_valid_json(self, mock_get):\n \"\"\"Test a valid JSON.\"\"\"\n mock_get.return_value.json.return_value = [{\"A\": 1, \"B\": 3}, {\"A\": 2, \"B\": 4}]\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/data.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertListEqual(df.columns.tolist(), [\"A\", \"B\"])\n self.assertListEqual(df[\"A\"].tolist(), [1, 2])\n self.assertListEqual(df[\"B\"].tolist(), [3, 4])\n @patch(\"requests.get\")\n def test_empty_json(self, mock_get):\n \"\"\"Test an empty JSON.\"\"\"\n mock_get.return_value.json.return_value = []\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/empty.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n 
self.assertEqual(len(df), 0)\n @patch(\"requests.get\")\n def test_invalid_json(self, mock_get):\n \"\"\"Test an invalid JSON.\"\"\"\n mock_get.return_value.json.side_effect = ValueError()\n with self.assertRaises(ValueError):\n f_850(\"https://example.com/invalid.json\")\n @patch(\"requests.get\")\n def test_large_json(self, mock_get):\n \"\"\"Test a large JSON.\"\"\"\n mock_get.return_value.json.return_value = [{\"X\": i} for i in range(1000)]\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/large.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertListEqual(df[\"X\"].tolist(), list(range(1000)))\n @patch(\"requests.get\")\n def test_null_json(self, mock_get):\n \"\"\"Test a JSON that is null.\"\"\"\n mock_get.return_value.json.return_value = None\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/null.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(len(df), 0)\n @patch(\"requests.get\")\n def test_system_error(self, mock_get):\n \"\"\"Test a general error.\"\"\"\n mock_get.side_effect = requests.RequestException\n with self.assertRaises(SystemError):\n f_850(\"https://example.com/data.json\")", "apis": ["pandas.DataFrame", "requests.get", "requests.RequestException"], "libs": ["pandas", "requests"], "doc": {"description": ["This function fetches JSON data from a specified URL and converts it into a Pandas DataFrame.", "It expects the JSON to be in a format that is directly convertible to a DataFrame, typically", "a list of dictionaries. 
The function handles various scenarios including successful data", "retrieval and conversion, network issues, and invalid JSON format.", "Notes:", "- The function uses a timeout of 5 seconds for the network request to avoid hanging indefinitely.", "- It checks the HTTP response status and raises an HTTPError for unsuccessful status codes.", "- Directly converts the HTTP response to JSON and then to a DataFrame, without intermediate processing."], "note": [], "params": ["url (str): The URL where the JSON file is located."], "returns": ["pd.DataFrame: A DataFrame constructed from the JSON data fetched from the URL."], "reqs": ["requests", "pandas"], "raises": ["SystemError: If there is a network-related issue such as a connection error, timeout,", "or if the server responded with an unsuccessful status code (like 404 or 500). This is a", "re-raised exception from requests.RequestException to provide a more specific error message.", "ValueError: If the fetched data is not in a valid JSON format that can be converted into", "a DataFrame. This could occur if the data structure does not match the expected format (e.g.,", "not a list of dictionaries)."], "example": [">>> f_850('https://example.com/data.json')", "DataFrame:", "A B"]}} -{"task_id": "f_915", "prompt": "import matplotlib.pyplot as plt\nfrom itertools import cycle\nimport numpy as np\nfrom random import shuffle\n\nCOLORS = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]\n\n\ndef f_915(list_of_lists):\n \"\"\"\n Plots a series of lines for each list in `list_of_lists`. Each line is plotted with shuffled y-values\n and sequential x-values starting from 1. The function shuffles the y-values of each inner list before plotting.\n Each line is plotted with a different color from a predetermined set of colors. 
The function cycles through \n these colors for each inner list.\n\n Parameters:\n - list_of_lists (list of list): A list of lists where each inner\n list represents a set of y-values to be shuffled and plotted. The x-values are automatically\n generated as a sequence starting from 1 up to the length of the inner list.\n\n Returns:\n - tuple: A tuple containing the figure and axes objects of the plotted graph.\n\n Requirements:\n - matplotlib\n - itertools\n - numpy\n - random\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> fig, ax = f_915([[1, 2, 3], [4, 5, 6]])\n >>> ax.lines[0].get_color()\n (0.0, 0.0, 1.0, 1)\n\n Note:\n - If an inner list is empty, it will be skipped and no line will be plotted for it.\n - The colors are reused cyclically if there are more inner lists than colors available.\n - The shuffling of y-values is random and different each time the function is called,\n unless a random seed is set externally.\n - The function uses a default set of colors defined in the COLORS constant.\n \"\"\"", "canonical_solution": " fig, ax = plt.subplots()\n color_cycle = cycle(COLORS)\n\n for list_ in list_of_lists:\n y_values = np.arange(1, len(list_) + 1)\n shuffle(y_values)\n ax.plot(y_values, next(color_cycle))\n\n return fig, ax", "test": "import unittest\nfrom matplotlib.figure import Figure\nfrom matplotlib.axes import Axes\nimport matplotlib.colors as mcolors\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_915.\"\"\"\n def test_return_types(self):\n \"\"\"Check that the function returns the correct types.\"\"\"\n random.seed(0)\n fig, ax = f_915([[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]])\n self.assertIsInstance(\n fig,\n Figure,\n \"The first return value should be an instance of matplotlib.figure.Figure.\",\n )\n self.assertIsInstance(\n ax,\n Axes,\n \"The second return value should be an instance of matplotlib.axes._subplots.AxesSubplot.\",\n )\n def test_number_of_lines(self):\n \"\"\"Check that 
the correct number of lines are plotted.\"\"\"\n random.seed(1)\n _, ax = f_915([[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]])\n self.assertEqual(\n len(ax.lines), 2, \"There should be 2 lines plotted for 2 lists.\"\n )\n _, ax = f_915([[\"x\", \"y\", \"z\"]])\n self.assertEqual(len(ax.lines), 1, \"There should be 1 line plotted for 1 list.\")\n def test_color_cycle(self):\n \"\"\"Check that the colors of the plotted lines follow the specified cycle.\"\"\"\n random.seed(2)\n _, ax = f_915([[\"x\"], [\"y\"], [\"z\"], [\"a\"], [\"b\"], [\"c\"], [\"d\"], [\"e\"]])\n expected_colors = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\", \"b\"]\n # Convert color codes to RGBA format\n expected_colors_rgba = [mcolors.to_rgba(c) for c in expected_colors]\n actual_colors_rgba = [line.get_color() for line in ax.lines]\n self.assertEqual(\n actual_colors_rgba,\n expected_colors_rgba,\n \"The colors of the plotted lines should follow the specified cycle.\",\n )\n def test_y_values(self):\n \"\"\"Check that the y-values are shuffled.\"\"\"\n random.seed(3)\n _, ax = f_915([[\"x\", \"y\", \"z\"]])\n y_data = ax.lines[0].get_ydata()\n self.assertTrue(\n set(y_data) == {1, 2, 3},\n \"The y-values should be shuffled numbers from the range [1, len(list)].\",\n )\n def test_empty_input(self):\n \"\"\"Check that no lines are plotted for an empty input list.\"\"\"\n random.seed(4)\n _, ax = f_915([])\n self.assertEqual(\n len(ax.lines),\n 0,\n \"There should be no lines plotted for an empty input list.\",\n )", "apis": ["numpy.arange", "random.shuffle", "itertools.cycle", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "random", "itertools"], "doc": {"description": ["Plots a series of lines for each list in `list_of_lists`. Each line is plotted with shuffled y-values", "and sequential x-values starting from 1. The function shuffles the y-values of each inner list before plotting.", "Each line is plotted with a different color from a predetermined set of colors. 
The function cycles through", "these colors for each inner list."], "note": ["If an inner list is empty, it will be skipped and no line will be plotted for it.", "The colors are reused cyclically if there are more inner lists than colors available.", "The shuffling of y-values is random and different each time the function is called,", "unless a random seed is set externally.", "The function uses a default set of colors defined in the COLORS constant."], "params": ["list_of_lists (list of list): A list of lists where each inner", "list represents a set of y-values to be shuffled and plotted. The x-values are automatically", "generated as a sequence starting from 1 up to the length of the inner list."], "returns": ["tuple: A tuple containing the figure and axes objects of the plotted graph."], "reqs": ["matplotlib", "itertools", "numpy", "random"], "raises": [], "example": [">>> import random", ">>> random.seed(0)", ">>> fig, ax = f_915([[1, 2, 3], [4, 5, 6]])", ">>> ax.lines[0].get_color()", "(0.0, 0.0, 1.0, 1)"]}} -{"task_id": "f_810", "prompt": "import numpy as np\nfrom scipy import integrate\nimport matplotlib.pyplot as plt\n\n\ndef f_810(func, x_range=(-2, 2), num_points=1000):\n \"\"\"\n Calculates and plots both a given function and its cumulative integral over a specified range,\n using a linearly spaced range of x-values.\n\n Parameters:\n func (function): A function of a single variable to integrate and plot.\n x_range (tuple, optional): The range (start, end) over which to evaluate `func`. Defaults to (-2, 2).\n num_points (int, optional): Number of points to generate in `x_range`. 
Defaults to 1000.\n\n Returns:\n matplotlib.axes.Axes: The Axes object containing the plots of the function and its integral.\n\n Requirements:\n - numpy\n - scipy\n - matplotlib\n\n Note:\n - The plot includes a legend and labels for the x and y axes that include the function's name.\n\n Example:\n >>> ax = f_810(np.sin)\n >>> type(ax)\n \n >>> ax.get_legend_handles_labels()[-1]\n ['sin(x)', 'Integral of sin(x)']\n \"\"\"", "canonical_solution": " X = np.linspace(x_range[0], x_range[1], num_points)\n y = func(X)\n y_int = integrate.cumulative_trapezoid(y, X, initial=0)\n\n fig, ax = plt.subplots()\n ax.plot(X, y, label=f\"{func.__name__}(x)\")\n ax.plot(X, y_int, label=f\"Integral of {func.__name__}(x)\")\n ax.legend()\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def helper_assert_plot_attributes(self, func):\n # Test plot attributes are as expected\n ax = f_810(func)\n function_name = func.__name__\n legend_labels = ax.get_legend_handles_labels()[-1]\n self.assertIsInstance(ax, Axes)\n self.assertIn(function_name, legend_labels[0])\n self.assertIn(function_name, legend_labels[1])\n def test_case_1(self):\n # Test basic case in docstring\n ax = f_810(np.sin)\n self.helper_assert_plot_attributes(np.sin)\n def test_case_2(self):\n # Test other functions - numpy\n for func in [np.cos, np.exp]:\n ax = f_810(func)\n self.helper_assert_plot_attributes(func)\n def test_case_3(self):\n # Test other functions - lambda\n func = lambda x: x ** 2\n ax = f_810(func)\n self.helper_assert_plot_attributes(func)\n def test_case_4(self):\n # Test custom range and points\n ax = f_810(np.cos, x_range=(0, np.pi), num_points=500)\n self.assertEqual(len(ax.lines[0].get_xdata()), 500)\n self.assertEqual(ax.lines[0].get_xdata()[0], 0)\n self.assertEqual(ax.lines[0].get_xdata()[-1], np.pi)\n def test_case_5(self):\n # Test 
correct integral calculation\n # Test integral of x^2 in the range [0,1], should be close to 1/3\n func = lambda x: x ** 2\n X = np.linspace(0, 1, 1000)\n expected_integral = 1 / 3 * X ** 3 # Analytical integral of x^2\n ax = f_810(func, x_range=(0, 1), num_points=1000)\n computed_integral = ax.lines[1].get_ydata()[\n -1\n ] # Last value of the computed integral\n self.assertAlmostEqual(computed_integral, expected_integral[-1], places=4)", "apis": ["scipy.integrate.cumulative_trapezoid", "numpy.linspace", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Calculates and plots both a given function and its cumulative integral over a specified range,", "using a linearly spaced range of x-values."], "note": ["The plot includes a legend and labels for the x and y axes that include the function's name."], "params": ["func (function): A function of a single variable to integrate and plot.", "x_range (tuple, optional): The range (start, end) over which to evaluate `func`. Defaults to (-2, 2).", "num_points (int, optional): Number of points to generate in `x_range`. 
Defaults to 1000."], "returns": ["matplotlib.axes.Axes: The Axes object containing the plots of the function and its integral."], "reqs": ["numpy", "scipy", "matplotlib"], "raises": [], "example": [">>> ax = f_810(np.sin)", ">>> type(ax)", "", ">>> ax.get_legend_handles_labels()[-1]", "['sin(x)', 'Integral of sin(x)']"]}} -{"task_id": "f_404", "prompt": "import pandas as pd\nimport numpy as np\nimport statsmodels.api as sm\n\n\ndef f_404(\n array: list, random_seed: int = 0\n) -> (pd.DataFrame, sm.regression.linear_model.RegressionResultsWrapper):\n \"\"\"\n Generate a Pandas DataFrame from a 2D list and perform a multiple linear regression.\n\n The function first validates the input list, creates a DataFrame, separates independent and dependent variables,\n adds a constant to the model, and fits a linear regression using statsmodels.\n\n Parameters:\n - array (list of list of int): A 2D list where each sub-list represents a row of data.\n Each sub-list should have exactly 5 elements, where the first 4 elements are\n treated as independent variables ('A', 'B', 'C', 'D') and the last element is\n the dependent (Response) variable.\n\n - random_seed (int): A seed for reproducibility in numpy for statsmodels. 
Defaults to 0.\n\n Returns:\n - df (pd.DataFrame): DataFrame with columns 'A', 'B', 'C', 'D', 'Response'.\n - results (statsmodels.RegressionResults): Results of the linear regression.\n\n Requirements:\n - pandas\n - numpy\n - statsmodels.api.sm\n\n Example:\n >>> df, results = f_404([[1,2,3,4,5], [6,7,8,9,10]])\n >>> print(df)\n A B C D Response\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n \"\"\"", "canonical_solution": " COLUMNS = [\"A\", \"B\", \"C\", \"D\", \"Response\"]\n\n np.random.seed(random_seed)\n\n if not all(len(row) == len(COLUMNS) for row in array):\n raise ValueError(\n \"Each sub-list in the input 2D list must have exactly 5 elements.\"\n )\n\n df = pd.DataFrame(array, columns=COLUMNS)\n X = df[COLUMNS[:-1]]\n y = df[\"Response\"]\n X = sm.add_constant(X)\n\n model = sm.OLS(y, X)\n results = model.fit()\n\n return df, results", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing dataframe creation, model accuracy, and parameters with various numeric data types\n test_data = [\n ([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]], 42, 1.0), # Positive values\n ([[-1, -2, -3, -4, -5], [-6, -7, -8, -9, -10]], 42, 1.0), # Negative values\n (\n [[100, 200, 300, 400, 500], [600, 700, 800, 900, 1000]],\n 42,\n 1.0,\n ), # Large values\n ]\n for array, random_seed, expected_r2 in test_data:\n with self.subTest(array=array):\n df, results = f_404(array, random_seed=random_seed)\n expected_df = pd.DataFrame(\n array, columns=[\"A\", \"B\", \"C\", \"D\", \"Response\"]\n )\n self.assertTrue(df.equals(expected_df))\n self.assertAlmostEqual(results.rsquared, expected_r2, places=2)\n for param in results.params:\n self.assertNotEqual(param, 0)\n def test_case_2(self):\n # Testing with more rows in the 2D list to ensure model scalability and consistency\n random_seed = 42\n array = [\n [1, 2, 3, 4, 5],\n [6, 7, 8, 9, 10],\n [11, 12, 13, 14, 15],\n [16, 17, 18, 19, 20],\n ]\n df, results = f_404(array, 
random_seed=random_seed)\n expected_df = pd.DataFrame(array, columns=[\"A\", \"B\", \"C\", \"D\", \"Response\"])\n self.assertTrue(df.equals(expected_df))\n self.assertAlmostEqual(results.rsquared, 1.0, places=2)\n for param in results.params:\n self.assertNotEqual(param, 0)\n def test_case_3(self):\n # Testing input validation for incorrect number of columns in a row\n array = [[1, 2, 3, 4], [5, 6, 7, 8]] # Missing dependent variable\n with self.assertRaises(ValueError):\n f_404(array)\n def test_case_4(self):\n # Testing handling of non-numeric values to ensure type safety\n array = [[\"a\", \"b\", \"c\", \"d\", \"e\"]] # All elements as strings\n with self.assertRaises(ValueError):\n df, results = f_404(array)\n # This assumes the function is modified to catch and raise ValueError for non-numeric inputs\n def test_case_5(self):\n # Testing reproducibility by using the same random_seed\n array = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]\n random_seed = 123\n df1, results1 = f_404(array, random_seed=random_seed)\n df2, results2 = f_404(array, random_seed=random_seed)\n self.assertTrue(df1.equals(df2))\n self.assertEqual(results1.params.tolist(), results2.params.tolist())\n def test_case_6(self):\n # Testing with an empty array to check function's handling of no input data\n array = []\n with self.assertRaises(ValueError):\n f_404(array)", "apis": ["statsmodels.api.add_constant", "numpy.random", "statsmodels.api.OLS", "numpy.random.seed", "statsmodels.api.regression", "pandas.DataFrame"], "libs": ["numpy", "pandas", "statsmodels"], "doc": {"description": ["Generate a Pandas DataFrame from a 2D list and perform a multiple linear regression.", "The function first validates the input list, creates a DataFrame, separates independent and dependent variables,", "adds a constant to the model, and fits a linear regression using statsmodels.", "- random_seed (int): A seed for reproducibility in numpy for statsmodels. 
Defaults to 0."], "note": [], "params": ["array (list of list of int): A 2D list where each sub-list represents a row of data.", "Each sub-list should have exactly 5 elements, where the first 4 elements are", "treated as independent variables ('A', 'B', 'C', 'D') and the last element is", "the dependent (Response) variable."], "returns": ["df (pd.DataFrame): DataFrame with columns 'A', 'B', 'C', 'D', 'Response'.", "results (statsmodels.RegressionResults): Results of the linear regression."], "reqs": ["pandas", "numpy", "statsmodels.api.sm"], "raises": [], "example": [">>> df, results = f_404([[1,2,3,4,5], [6,7,8,9,10]])", ">>> print(df)", "A B C D Response", "0 1 2 3 4 5", "1 6 7 8 9 10"]}} -{"task_id": "f_370", "prompt": "from collections import Counter\nimport pandas as pd\n\n\ndef f_370(myList):\n \"\"\"\n Count the frequency of each word in a list and return a DataFrame of words and their number.\n\n Parameters:\n myList (list): List of strings. Each string is considered a word regardless of its content,\n however the function is case insensitive, and it removes\n leading and trailing whitespaces. 
If empty, function returns\n a DataFrame with a Count column that is otherwise empty.\n\n Returns:\n DataFrame: A pandas DataFrame with words and their counts.\n\n Requirements:\n - collections.Counter\n - pandas\n\n Example:\n >>> myList = ['apple', 'banana', 'apple', 'cherry', 'banana', 'banana']\n >>> f_370(myList)\n Count\n apple 2\n banana 3\n cherry 1\n \"\"\"", "canonical_solution": " words = [w.lower().strip() for w in myList]\n word_counts = dict(Counter(words))\n report_df = pd.DataFrame.from_dict(word_counts, orient=\"index\", columns=[\"Count\"])\n\n return report_df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n input_data = [\"apple\", \"banana\", \"apple\", \"cherry\", \"banana\", \"banana\"]\n expected_output = pd.DataFrame(\n {\"Count\": [2, 3, 1]}, index=[\"apple\", \"banana\", \"cherry\"]\n )\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_2(self):\n # Test repeated value\n input_data = [\"apple\", \"apple\", \"apple\"]\n expected_output = pd.DataFrame({\"Count\": [3]}, index=[\"apple\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_3(self):\n # Test empty list\n input_data = []\n expected_output = pd.DataFrame(columns=[\"Count\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_4(self):\n # Test single entry\n input_data = [\"kiwi\"]\n expected_output = pd.DataFrame({\"Count\": [1]}, index=[\"kiwi\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_5(self):\n # Tests the function's ability to handle mixed case words correctly.\n input_data = [\"Apple\", \"apple\", \"APPLE\"]\n expected_output = pd.DataFrame({\"Count\": [3]}, index=[\"apple\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_6(self):\n # Tests the function's ability to handle words with leading/trailing spaces.\n 
input_data = [\"banana \", \" banana\", \" banana\"]\n expected_output = pd.DataFrame({\"Count\": [3]}, index=[\"banana\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_7(self):\n # Tests the function's ability to handle words with special characters.\n input_data = [\"kiwi!\", \"!kiwi\", \"kiwi\"]\n expected_output = pd.DataFrame(\n {\"Count\": [1, 1, 1]}, index=[\"kiwi!\", \"!kiwi\", \"kiwi\"]\n )\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_8(self):\n # Tests the function's handling of numeric strings as words.\n input_data = [\"123\", \"456\", \"123\", \"456\", \"789\"]\n expected_output = pd.DataFrame(\n {\"Count\": [2, 2, 1]}, index=[\"123\", \"456\", \"789\"]\n )\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_9(self):\n # Tests the function's handling of empty strings and strings with only spaces.\n input_data = [\" \", \" \", \"\", \"apple\", \"apple \"]\n expected_output = pd.DataFrame({\"Count\": [3, 2]}, index=[\"\", \"apple\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_10(self):\n # Tests handling of strings that become duplicates after strip() is applied.\n input_data = [\"banana\", \"banana \", \" banana\", \"banana\"]\n expected_output = pd.DataFrame({\"Count\": [4]}, index=[\"banana\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)", "apis": ["pandas.DataFrame.from_dict", "pandas.DataFrame", "collections.Counter"], "libs": ["pandas", "collections"], "doc": {"description": ["Count the frequency of each word in a list and return a DataFrame of words and their number."], "note": [], "params": ["myList (list): List of strings. Each string is considered a word regardless of its content,", "however the function is case insensitive, and it removes", "leading and trailing whitespaces. 
If empty, function returns", "a DataFrame with a Count column that is otherwise empty."], "returns": ["DataFrame: A pandas DataFrame with words and their counts."], "reqs": ["collections.Counter", "pandas"], "raises": [], "example": [">>> myList = ['apple', 'banana', 'apple', 'cherry', 'banana', 'banana']", ">>> f_370(myList)", "Count", "apple 2", "banana 3", "cherry 1"]}} -{"task_id": "f_826", "prompt": "import pandas as pd\nimport seaborn as sns\nimport numpy as np\n\n\ndef f_826(df):\n \"\"\"\n Generates a pair plot from a numeric DataFrame and calculates its covariance matrix.\n\n Parameters:\n - df (pd.DataFrame): A pandas DataFrame with only numeric columns.\n\n Returns:\n - tuple:\n - covariance_df (pd.DataFrame): The covariance matrix of the input DataFrame.\n - pair_plot (sns.axisgrid.PairGrid): Pair plot of the input DataFrame.\n\n Raises:\n - ValueError: If the DataFrame is empty.\n - TypeError: If the DataFrame contains non-numeric data types.\n\n Requirements:\n - pandas\n - numpy\n - seaborn\n\n Examples:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n >>> covariance_df, ax = f_826(df)\n >>> type(ax)\n \n >>> covariance_df\n A B C\n A 1.0 1.0 1.0\n B 1.0 1.0 1.0\n C 1.0 1.0 1.0\n \"\"\"", "canonical_solution": " if df.empty:\n raise ValueError(\"DataFrame is empty. Non-empty DataFrame required.\")\n if not all(df.dtypes.apply(lambda x: np.issubdtype(x, np.number))):\n raise TypeError(\n \"DataFrame contains non-numeric data. 
Only numeric data types are supported.\"\n )\n covariance_df = df.cov()\n pair_plot = sns.pairplot(df)\n\n return covariance_df, pair_plot", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_covariance_one(self):\n \"\"\"Test basic case with expected covariance of 1.0\"\"\"\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6], \"C\": [7, 8, 9]})\n covariance_df, _ = f_826(df)\n self.assertTrue((covariance_df == 1).all().all())\n def test_identical_values_dataframe(self):\n \"\"\"Test DataFrame where all rows have identical values.\"\"\"\n df = pd.DataFrame({\"A\": [1, 1, 1], \"B\": [2, 2, 2]})\n covariance_df, _ = f_826(df)\n self.assertTrue((covariance_df == 0).all().all())\n def test_with_empty_dataframe(self):\n \"\"\"Test handling empty input (should raise error).\"\"\"\n df = pd.DataFrame()\n with self.assertRaises(ValueError):\n f_826(df)\n def test_with_non_numeric_dataframe(self):\n \"\"\"Test handling unsupported data types.\"\"\"\n df = pd.DataFrame({\"A\": [\"a\", \"b\", \"c\"], \"B\": [\"d\", \"e\", \"f\"]})\n with self.assertRaises(TypeError):\n f_826(df)\n def test_plot_attributes(self):\n \"\"\"Test plot attributes.\"\"\"\n df = pd.DataFrame({\"X\": [10, 20, 30], \"Y\": [15, 25, 35]})\n _, pair_plot = f_826(df)\n self.assertIsInstance(pair_plot, sns.axisgrid.PairGrid)\n self.assertEqual(len(pair_plot.axes), 2) # Should have 2x2 grid for pair plot\n def test_single_column_dataframe(self):\n \"\"\"Test handling of DataFrame with a single numeric column.\"\"\"\n df = pd.DataFrame({\"A\": [1, 2, 3]})\n covariance_df, _ = f_826(df)\n self.assertEqual(covariance_df.loc[\"A\"].item(), 1.0)\n self.assertEqual(covariance_df.shape, (1, 1))", "apis": ["numpy.number", "seaborn.pairplot", "numpy.issubdtype"], "libs": ["numpy", "seaborn"], "doc": {"description": ["Generates a pair plot from a numeric DataFrame and calculates its covariance matrix."], "note": [], "params": ["df (pd.DataFrame): A pandas DataFrame with 
only numeric columns."], "returns": ["tuple:", "covariance_df (pd.DataFrame): The covariance matrix of the input DataFrame.", "pair_plot (sns.axisgrid.PairGrid): Pair plot of the input DataFrame."], "reqs": ["pandas", "numpy", "seaborn"], "raises": ["ValueError: If the DataFrame is empty.", "TypeError: If the DataFrame contains non-numeric data types."], "example": ["Examples:", ">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})", ">>> covariance_df, ax = f_826(df)", ">>> type(ax)", "", ">>> covariance_df", "A B C", "A 1.0 1.0 1.0", "B 1.0 1.0 1.0", "C 1.0 1.0 1.0"]}} -{"task_id": "f_382", "prompt": "import math\nimport numpy as np\nfrom datetime import datetime\nimport pandas as pd\n\n\ndef f_382(\n start_time,\n end_time,\n step,\n columns=[\"Timestamp\", \"Sensor1\", \"Sensor2\", \"Sensor3\", \"SensorStatus\"],\n sensor_statuses=[\"OK\", \"MAINTENANCE_REQUIRED\", \"ERROR\"],\n random_seed=42,\n):\n \"\"\"\n Generate a DataFrame with detailed artificial sensor readings for specified timestamps\n and sensor statuses from a predefined list.\n\n The function generates sensor readings for Sensor1, Sensor2, and Sensor3 (or their\n corresponding named columns in the supplied column list) using sine, cosine, and tan\n functions, respectively, of the timestamp (converted to seconds), with a small random\n noise added to simulate real sensor data variability.\n SensorStatus is randomly chosen from the provided statuses for each timestamp.\n\n Parameters:\n - start_time (int): Start time in milliseconds since epoch.\n - end_time (int): End time in milliseconds since epoch. Must not be before start_time.\n - step (int): The interval in milliseconds between each generated data point. Must be positive.\n This step defines the frequency at which data points are generated. 
If the step\n does not neatly divide the interval between start_time and end_time into\n equal-sized portions, the last timestamp may be excluded.\n - columns (list of str, optional): Names of the DataFrame columns to be included in the output.\n Defaults to: ['Timestamp', 'Sensor1', 'Sensor2', 'Sensor3', 'SensorStatus'].\n Regardless of naming, the function will populate the first column with\n timestamp, the middle columns with sensor data, and the final with status.\n - sensor_statuses (list of str, optional): Possible statuses for the sensors to randomly assign in the dataset.\n Defaults to: ['OK', 'MAINTENANCE_REQUIRED', 'ERROR'].\n - random_seed (int, optional): Seed for the random number generator to ensure reproducible results.\n Defaults to 42.\n\n Returns:\n - pd.DataFrame: Generated sensor readings for the given timestamps.\n\n Requirements:\n - math\n - datetime\n - numpy\n - pandas\n\n Example:\n >>> df = f_382(0, 5000, 1000)\n >>> type(df)\n \n >>> df.head(1)\n Timestamp Sensor1 Sensor2 Sensor3 SensorStatus\n 0 1970-01-01 00:00:00.000000 0.049671 0.986174 0.064769 ERROR\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n\n if start_time > end_time:\n raise ValueError(\"start_time cannot be after end_time\")\n if step < 0:\n raise ValueError(\"step must be positive\")\n\n timestamps = list(range(start_time, end_time, step))\n\n data = []\n for ts in timestamps:\n dt = datetime.utcfromtimestamp(ts / 1000).strftime(\"%Y-%m-%d %H:%M:%S.%f\")\n sensor1 = math.sin(ts / 1000) + np.random.normal(0, 0.1)\n sensor2 = math.cos(ts / 1000) + np.random.normal(0, 0.1)\n sensor3 = math.tan(ts / 1000) + np.random.normal(0, 0.1)\n status = np.random.choice(sensor_statuses)\n row = [dt, sensor1, sensor2, sensor3, status]\n data.append(row)\n\n return pd.DataFrame(data, columns=columns)", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n df = f_382(0, 10000, 
100, random_seed=42)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(\n list(df.columns),\n [\"Timestamp\", \"Sensor1\", \"Sensor2\", \"Sensor3\", \"SensorStatus\"],\n )\n self.assertTrue(\n (df[\"SensorStatus\"].isin([\"OK\", \"MAINTENANCE_REQUIRED\", \"ERROR\"])).all()\n )\n def test_case_2(self):\n # Test custom columns\n columns = [\"Time\", \"Sensor_A\", \"Sensor_B\", \"Sensor_C\", \"Status\"]\n statuses = [\"WORKING\", \"NEEDS_CHECK\", \"FAILED\"]\n df = f_382(\n 1500, 3000, 50, columns=columns, sensor_statuses=statuses, random_seed=42\n )\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(list(df.columns), columns)\n self.assertTrue((df[\"Status\"].isin(statuses)).all())\n def test_case_3(self):\n # Test generated data integrity by comparing with expected results\n np.random.seed(42)\n ts = 0 # Using the starting timestamp for simplicity\n expected_sensor1 = math.sin(ts / 1000) + np.random.normal(0, 0.1, 1)[0]\n expected_sensor2 = math.cos(ts / 1000) + np.random.normal(0, 0.1, 1)[0]\n expected_sensor3 = math.tan(ts / 1000) + np.random.normal(0, 0.1, 1)[0]\n df = f_382(0, 100, 100, random_seed=42)\n self.assertAlmostEqual(df.iloc[0][\"Sensor1\"], expected_sensor1, places=5)\n self.assertAlmostEqual(df.iloc[0][\"Sensor2\"], expected_sensor2, places=5)\n self.assertAlmostEqual(df.iloc[0][\"Sensor3\"], expected_sensor3, places=5)\n def test_case_4(self):\n # Test handling invalid start times\n with self.assertRaises(ValueError):\n f_382(10000, 0, 100)\n def test_case_5(self):\n # Test handling incorrect end times\n with self.assertRaises(ValueError):\n f_382(1000, 900, 100)\n def test_case_6(self):\n # Test column handling\n columns = [\"Time\", \"Value1\", \"Value2\", \"Value3\", \"MachineStatus\"]\n df = f_382(0, 500, 100, columns=columns)\n self.assertEqual(list(df.columns), columns)\n # Too few/too many columns\n with self.assertRaises(ValueError):\n f_382(0, 500, 100, columns[:-1])\n with self.assertRaises(ValueError):\n f_382(0, 500, 
100, columns + [\"foo\", \"bar\"])\n def test_case_7(self):\n # Test sensor status handling\n with self.assertRaises(ValueError):\n f_382(0, 500, 100, [])\n statuses = [\"RUNNING\", \"SHUTDOWN\", \"ERROR\"]\n df = f_382(0, 500, 100, sensor_statuses=statuses)\n self.assertTrue((df[\"SensorStatus\"].isin(statuses)).all())\n def test_case_8(self):\n # Test random seed\n df1 = f_382(0, 500, 100, random_seed=42)\n df2 = f_382(0, 500, 100, random_seed=42)\n pd.testing.assert_frame_equal(df1, df2)\n def test_case_9(self):\n # Test invalid steps handling\n with self.assertRaises(ValueError):\n f_382(0, 1000, -100) # Step is negative\n with self.assertRaises(ValueError):\n f_382(0, 1000, 0) # Step is zero", "apis": ["math.cos", "datetime.datetime.utcfromtimestamp", "math.tan", "numpy.random", "numpy.random.seed", "numpy.random.choice", "numpy.random.normal", "pandas.DataFrame", "math.sin"], "libs": ["numpy", "pandas", "datetime", "math"], "doc": {"description": ["Generate a DataFrame with detailed artificial sensor readings for specified timestamps", "and sensor statuses from a predefined list.", "The function generates sensor readings for Sensor1, Sensor2, and Sensor3 (or their", "corresponding named columns in the supplied column list) using sine, cosine, and tan", "functions, respectively, of the timestamp (converted to seconds), with a small random", "noise added to simulate real sensor data variability.", "SensorStatus is randomly chosen from the provided statuses for each timestamp."], "note": [], "params": ["start_time (int): Start time in milliseconds since epoch.", "end_time (int): End time in milliseconds since epoch. Must not be before start_time.", "step (int): The interval in milliseconds between each generated data point. Must be positive.", "This step defines the frequency at which data points are generated. 
If the step", "does not neatly divide the interval between start_time and end_time into", "equal-sized portions, the last timestamp may be excluded.", "columns (list of str, optional): Names of the DataFrame columns to be included in the output.", "Defaults to: ['Timestamp', 'Sensor1', 'Sensor2', 'Sensor3', 'SensorStatus'].", "Regardless of naming, the function will populate the first column with", "timestamp, the middle columns with sensor data, and the final with status.", "sensor_statuses (list of str, optional): Possible statuses for the sensors to randomly assign in the dataset.", "Defaults to: ['OK', 'MAINTENANCE_REQUIRED', 'ERROR'].", "random_seed (int, optional): Seed for the random number generator to ensure reproducible results.", "Defaults to 42."], "returns": ["pd.DataFrame: Generated sensor readings for the given timestamps."], "reqs": ["math", "datetime", "numpy", "pandas"], "raises": [], "example": [">>> df = f_382(0, 5000, 1000)", ">>> type(df)", "", ">>> df.head(1)", "Timestamp Sensor1 Sensor2 Sensor3 SensorStatus", "0 1970-01-01 00:00:00.000000 0.049671 0.986174 0.064769 ERROR"]}} -{"task_id": "f_762", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\ndef f_762(df):\n \"\"\"\n Draw and return a correlation matrix heatmap for a DataFrame containing numerical columns.\n The title of the heatmap is set to 'Correlation Matrix'.\n \n Parameters:\n df (pandas.DataFrame): The DataFrame containing numerical columns to be used for correlation.\n\n Returns:\n matplotlib.axes._subplots.AxesSubplot: The matplotlib Axes object representing the heatmap.\n\n Requirements:\n - pandas\n - seaborn\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n >>> ax = f_762(df)\n >>> type(ax)\n \n\n \"\"\"", "canonical_solution": " correlation_matrix = df.corr()\n ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')\n ax.set_title('Correlation Matrix')\n return ax", 
"test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_2(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [-4, -5, -6], 'c': [-7, -8, -9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_3(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [-7, -8, -9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_4(self):\n df = pd.DataFrame({'a': [1, 1, 1], 'b': [2, 2, 2], 'c': [3, 3, 3]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_5(self):\n df = pd.DataFrame({'a': [1, 2, None], 'b': [4, None, 6], 'c': [None, 8, 9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')", "apis": ["seaborn.heatmap"], "libs": ["seaborn"], "doc": {"description": ["Draw and return a correlation matrix heatmap for a DataFrame containing numerical columns.", "The title of the heatmap is set to 'Correlation Matrix'."], "note": [], "params": ["df (pandas.DataFrame): The DataFrame containing numerical columns to be used for correlation."], "returns": ["matplotlib.axes._subplots.AxesSubplot: The matplotlib Axes object representing the heatmap."], "reqs": ["pandas", "seaborn", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})", ">>> ax = f_762(df)", ">>> type(ax)", ""]}} -{"task_id": "f_912", "prompt": "import warnings\nimport sqlite3\nimport pandas as pd\n\n\ndef f_912(db_path, query, 
warn_large_dataset=True):\n \"\"\"\n Fetches data from an SQLite database using the provided database path and SQL query.\n This function can optionally issue a warning when the dataset fetched contains more than 10,000 rows.\n\n Parameters:\n - db_path (str): The file path to the SQLite database from which data needs to be fetched.\n - query (str): The SQL query string used to retrieve data from the specified database.\n - warn_large_dataset (bool, optional): A boolean flag that, when set to True, triggers a \n warning if the retrieved dataset has more than 10,000 rows. Default is True.\n\n Returns:\n - pandas.DataFrame: A DataFrame containing the data fetched from the database.\n\n Requirements:\n - sqlite3\n - pandas\n - warnings\n\n Raises:\n - Exception: If any error occurs during database connection, SQL query execution, or data \n fetching. The error message provides details about the issue.\n\n Example:\n >>> data = f_912('/path/to/sqlite.db', 'SELECT * FROM table_name')\n >>> print(data)\n column1 column2\n 0 1 4\n 1 2 5\n 2 3 6\n \"\"\"", "canonical_solution": " if warn_large_dataset:\n warnings.simplefilter(\"always\")\n\n try:\n with sqlite3.connect(db_path) as conn:\n data = pd.read_sql_query(query, conn)\n\n if warn_large_dataset and data.shape[0] > 10000:\n warnings.warn(\"The data contains more than 10000 rows.\")\n\n return data\n\n except Exception as e:\n raise Exception(f\"Error fetching data from the database: {str(e)}\") from e", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport pandas as pd\nimport sqlite3\nimport warnings\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_912 function.\"\"\"\n def setUp(self):\n self.db_path = \"/path/to/sqlite.db\"\n self.query = \"SELECT * FROM table_name\"\n self.mock_data = pd.DataFrame({\"column1\": [1, 2, 3], \"column2\": [4, 5, 6]})\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_successful_query(self, mock_connect, mock_read_sql):\n 
\"\"\"\n Test f_912 function for successful query execution.\n \"\"\"\n mock_connect.return_value.__enter__.return_value = MagicMock()\n mock_read_sql.return_value = self.mock_data\n result = f_912(self.db_path, self.query)\n print(result)\n mock_connect.assert_called_with(self.db_path)\n mock_read_sql.assert_called_with(\n self.query, mock_connect.return_value.__enter__.return_value\n )\n self.assertTrue(result.equals(self.mock_data))\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_large_dataset_warning(self, mock_connect, mock_read_sql):\n \"\"\"\n Test f_912 function to check if it issues a warning for large datasets.\n \"\"\"\n large_data = pd.DataFrame({\"column1\": range(10001)})\n mock_read_sql.return_value = large_data\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter(\"always\")\n f_912(self.db_path, self.query)\n self.assertEqual(len(w), 1)\n self.assertTrue(\"more than 10000 rows\" in str(w[-1].message))\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_no_warning_for_small_dataset(self, mock_connect, mock_read_sql):\n \"\"\"\n Test f_912 function to ensure no warning for datasets smaller than 10000 rows.\n \"\"\"\n mock_read_sql.return_value = self.mock_data\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter(\"always\")\n f_912(self.db_path, self.query)\n self.assertEqual(len(w), 0)\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_database_exception(self, mock_connect, mock_read_sql):\n \"\"\"\n Test f_912 function to handle database connection exceptions.\n \"\"\"\n mock_connect.side_effect = sqlite3.OperationalError(\"Failed to connect\")\n with self.assertRaises(Exception) as context:\n f_912(self.db_path, self.query)\n self.assertIn(\"Error fetching data from the database\", str(context.exception))\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_sql_query_exception(self, mock_connect, 
mock_read_sql):\n \"\"\"\n Test f_912 function to handle SQL query execution exceptions.\n \"\"\"\n mock_read_sql.side_effect = pd.io.sql.DatabaseError(\"Failed to execute query\")\n with self.assertRaises(Exception) as context:\n f_912(self.db_path, self.query)\n self.assertIn(\"Error fetching data from the database\", str(context.exception))", "apis": ["warnings.warn", "warnings.simplefilter", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["sqlite3", "pandas", "warnings"], "doc": {"description": ["Fetches data from an SQLite database using the provided database path and SQL query.", "This function can optionally issue a warning when the dataset fetched contains more than 10,000 rows."], "note": [], "params": ["db_path (str): The file path to the SQLite database from which data needs to be fetched.", "query (str): The SQL query string used to retrieve data from the specified database.", "warn_large_dataset (bool, optional): A boolean flag that, when set to True, triggers a", "warning if the retrieved dataset has more than 10,000 rows. Default is True."], "returns": ["pandas.DataFrame: A DataFrame containing the data fetched from the database."], "reqs": ["sqlite3", "pandas", "warnings"], "raises": ["Exception: If any error occurs during database connection, SQL query execution, or data", "fetching. 
The error message provides details about the issue."], "example": [">>> data = f_912('/path/to/sqlite.db', 'SELECT * FROM table_name')", ">>> print(data)", "column1 column2", "0 1 4", "1 2 5", "2 3 6"]}} -{"task_id": "f_929", "prompt": "import re\nfrom collections import Counter\nimport matplotlib.pyplot as plt\n\n\ndef f_929(text):\n \"\"\"\n Analyzes the frequency of words in a given text after lowercasing, removing punctuation, splitting into words,\n and plots the top 10 most common words.\n\n Parameters:\n - text (str): The input text to be analyzed.\n\n Returns:\n - list: A list of tuples containing the 10 most common words and their counts.\n - Axes: The matplotlib Axes object of the bar chart.\n\n Requirements:\n - re\n - collections.Counter\n - matplotlib.pyplot\n\n Example:\n >>> common_words, ax = f_929(\"This is a sample text. This text contains sample words like 'text', 'sample', and 'words'.\")\n >>> print(common_words)\n [('sample', 3), ('text', 3), ('this', 2), ('words', 2), ('is', 1), ('a', 1), ('contains', 1), ('like', 1), ('and', 1)]\n \"\"\"", "canonical_solution": " # Process text and count words\n cleaned_text = re.sub(f\"[{punctuation}]\", \"\", text).lower()\n words = cleaned_text.split()\n word_counts = Counter(words)\n most_common_words = word_counts.most_common(10)\n\n # Plotting\n _, ax = plt.subplots()\n if most_common_words: # Check if the list is not empty\n ax.bar(*zip(*most_common_words))\n else: # Handle empty case\n ax.bar([], [])\n\n return most_common_words, ax", "test": "import unittest\nfrom string import punctuation\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_929.\"\"\"\n def test_empty_text(self):\n \"\"\"\n Test the function with an empty string. Expect an empty list and a chart with no bars.\n \"\"\"\n common_words, _ = f_929(\"\")\n self.assertEqual(common_words, [])\n def test_single_word(self):\n \"\"\"\n Test the function with a text containing a single word repeated. 
Expect the word with its count.\n \"\"\"\n common_words, _ = f_929(\"test test test\")\n self.assertEqual(common_words, [(\"test\", 3)])\n def test_punctuation(self):\n \"\"\"\n Test the function with a text containing punctuations. Expect punctuations to be removed.\n \"\"\"\n common_words, _ = f_929(\"hello! hello, world.\")\n self.assertEqual(common_words, [(\"hello\", 2), (\"world\", 1)])\n def test_case_sensitivity(self):\n \"\"\"\n Test the function with a text containing the same word in different cases. Expect case insensitivity.\n \"\"\"\n common_words, _ = f_929(\"Hello hello HeLLo\")\n self.assertEqual(common_words, [(\"hello\", 3)])\n def test_common_scenario(self):\n \"\"\"\n Test the function with a standard sentence. Expect a correct count and ordering of words.\n \"\"\"\n text = \"This is a test. This is only a test.\"\n common_words, _ = f_929(text)\n expected = [(\"this\", 2), (\"is\", 2), (\"a\", 2), (\"test\", 2), (\"only\", 1)]\n self.assertEqual(common_words, expected)\n def tearDown(self):\n plt.close()", "apis": ["re.sub", "collections.Counter", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "collections", "re"], "doc": {"description": ["Analyzes the frequency of words in a given text after lowercasing, removing punctuation, splitting into words,", "and plots the top 10 most common words."], "note": [], "params": ["text (str): The input text to be analyzed."], "returns": ["list: A list of tuples containing the 10 most common words and their counts.", "Axes: The matplotlib Axes object of the bar chart."], "reqs": ["re", "collections.Counter", "matplotlib.pyplot"], "raises": [], "example": [">>> common_words, ax = f_929(\"This is a sample text. 
This text contains sample words like 'text', 'sample', and 'words'.\")", ">>> print(common_words)", "[('sample', 3), ('text', 3), ('this', 2), ('words', 2), ('is', 1), ('a', 1), ('contains', 1), ('like', 1), ('and', 1)]"]}} -{"task_id": "f_873", "prompt": "import itertools\nimport string\nimport pandas as pd\n\n\ndef f_873():\n \"\"\"\n Generate all possible combinations (with replacement) of three letters from the alphabet and save them in a pandas DataFrame.\n\n Parameters:\n - None\n\n Returns:\n - DataFrame: A pandas DataFrame with each row representing a unique combination of three letters.\n\n Requirements:\n - itertools\n - string\n - pandas\n\n Example:\n >>> df = f_873()\n >>> print(df.head())\n Letter 1 Letter 2 Letter 3\n 0 a a a\n 1 a a b\n 2 a a c\n 3 a a d\n 4 a a e\n \"\"\"", "canonical_solution": " LETTERS = list(string.ascii_lowercase)\n combinations = list(itertools.product(LETTERS, repeat=3))\n\n df = pd.DataFrame(combinations, columns=[\"Letter 1\", \"Letter 2\", \"Letter 3\"])\n\n return df", "test": "import unittest\nimport pandas as pd\nfrom itertools import product\nimport string\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_873.\"\"\"\n def test_combinations(self):\n \"\"\"\n Test if the function generates the correct combinations with replacement.\n \"\"\"\n correct_combinations = list(product(string.ascii_lowercase, repeat=3))\n result_df = f_873()\n result_combinations = [tuple(row) for row in result_df.values]\n self.assertEqual(\n result_combinations,\n correct_combinations,\n \"The combinations are not correct.\",\n )\n def test_columns(self):\n \"\"\"\n Test if the DataFrame has the correct column names.\n \"\"\"\n result_df = f_873()\n self.assertEqual(\n list(result_df.columns),\n [\"Letter 1\", \"Letter 2\", \"Letter 3\"],\n \"Column names are not correct.\",\n )\n def test_shape(self):\n \"\"\"\n Test if the shape of the DataFrame is correct.\n \"\"\"\n result_df = f_873()\n self.assertEqual(\n 
result_df.shape,\n (26**3, 3),\n \"Shape of the DataFrame is not correct.\",\n )\n def test_data_type(self):\n \"\"\"\n Test if all DataFrame columns contain strings.\n \"\"\"\n result_df = f_873()\n for col in result_df.columns:\n self.assertTrue(\n result_df[col].apply(lambda x: isinstance(x, str)).all(),\n f\"Column {col} does not contain all strings.\",\n )\n def test_no_duplicates(self):\n \"\"\"\n Test if there are no duplicate combinations in the DataFrame.\n \"\"\"\n result_df = f_873()\n result_combinations = [tuple(row) for row in result_df.values]\n self.assertEqual(\n len(result_combinations),\n len(set(result_combinations)),\n \"Found duplicate combinations.\",\n )", "apis": ["string.ascii_lowercase", "pandas.DataFrame", "itertools.product"], "libs": ["string", "pandas", "itertools"], "doc": {"description": ["Generate all possible combinations (with replacement) of three letters from the alphabet and save them in a pandas DataFrame."], "note": [], "params": ["None"], "returns": ["DataFrame: A pandas DataFrame with each row representing a unique combination of three letters."], "reqs": ["itertools", "string", "pandas"], "raises": [], "example": [">>> df = f_873()", ">>> print(df.head())", "Letter 1 Letter 2 Letter 3", "0 a a a", "1 a a b", "2 a a c", "3 a a d", "4 a a e"]}} -{"task_id": "f_423", "prompt": "import sqlite3\nimport pandas as pd\nimport seaborn as sns\n\n\ndef f_423(db_name=\"test.db\", table_name=\"People\"):\n \"\"\"\n Draw the age distribution of the persons in an SQLite3 table and returns the Axes object of the plot.\n Raises a ValueError if the loaded data contains negative age values.\n\n Parameters:\n db_name (str, optional): The full path to the SQLite3 database file. Defaults to 'test.db'.\n table_name (str, optional): The name of the table to plot from. 
Defaults to 'People'.\n\n Returns:\n matplotlib.axes._subplots.AxesSubplot: Axes object representing the age distribution plot,\n with x-axis showing age and a default of bins=30, kde=True.\n\n Requirements:\n - sqlite3\n - pandas\n - seaborn\n\n Examples:\n >>> ax = f_423('path/to/test.db', 'People')\n >>> type(ax)\n \n >>> ax = f_423()\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " conn = sqlite3.connect(db_name)\n df = pd.read_sql_query(f\"SELECT age from {table_name}\", conn)\n\n if (df[\"age\"] < 0).any():\n raise ValueError(\"Data contains negative age values.\")\n\n ax = sns.histplot(data=df, x=\"age\", bins=30, kde=True)\n ax.set_xlabel(\"age\")\n return ax", "test": "import unittest\nimport os\nimport sqlite3\nimport matplotlib.pyplot as plt\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Setup temporary directory\n self.test_dir = tempfile.TemporaryDirectory()\n # Create test_alt.db with People table\n self.alt_db_path = os.path.join(self.test_dir.name, \"test_alt.db\")\n conn = sqlite3.connect(self.alt_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE People (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO People VALUES (?, ?)\", [(\"Alice\", 25), (\"Bob\", 30)]\n )\n conn.commit()\n conn.close()\n # Create a standard test.db with Employees table\n self.default_db_path = os.path.join(self.test_dir.name, \"test.db\")\n conn = sqlite3.connect(self.default_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE Employees (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO Employees VALUES (?, ?)\", [(\"Charlie\", 35), (\"David\", 40)]\n )\n conn.commit()\n conn.close()\n # Create standard db with more examples\n self.multiple_db_path = os.path.join(self.test_dir.name, \"test_multiple.db\")\n conn = sqlite3.connect(self.multiple_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE MultipleAge (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO 
MultipleAge VALUES (?, ?)\",\n [(\"Alice\", 25), (\"Bob\", 30), (\"Charlie\", 35)],\n )\n conn.commit()\n conn.close()\n # Create a db for testing edge cases - negative age\n self.negative_age_db_path = os.path.join(\n self.test_dir.name, \"test_negative_age.db\"\n )\n conn = sqlite3.connect(self.negative_age_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE NegativeAge (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO NegativeAge VALUES (?, ?)\", [(\"Eve\", -1), (\"Frank\", 20)]\n )\n conn.commit()\n conn.close()\n # Create a db for testing edge cases - empty\n self.empty_db_path = os.path.join(self.test_dir.name, \"test_empty.db\")\n conn = sqlite3.connect(self.empty_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE EmptyAge (name TEXT, age INT)\")\n conn.commit()\n conn.close()\n def tearDown(self):\n self.test_dir.cleanup()\n plt.close(\"all\")\n def _check_plot(self, ax, contains_data=True):\n self.assertTrue(isinstance(ax, plt.Axes), \"The plot should be an Axes object.\")\n self.assertEqual(ax.get_xlabel(), \"age\", \"The x-axis label should be 'age'.\")\n if contains_data:\n self.assertTrue(len(ax.lines) > 0, \"The plot should contain a KDE line.\")\n def test_case_1(self):\n ax = f_423(db_name=self.default_db_path, table_name=\"Employees\")\n self._check_plot(ax)\n def test_case_2(self):\n ax = f_423(db_name=self.alt_db_path)\n self._check_plot(ax)\n def test_case_3(self):\n ax = f_423(db_name=self.default_db_path, table_name=\"Employees\")\n self._check_plot(ax)\n def test_case_4(self):\n ax = f_423(db_name=self.multiple_db_path, table_name=\"MultipleAge\")\n self._check_plot(ax)\n def test_case_5(self):\n ax = f_423(db_name=self.empty_db_path, table_name=\"EmptyAge\")\n self._check_plot(ax, False)\n def test_case_6(self):\n # Test for non-existent table\n with self.assertRaises(Exception):\n f_423(db_name=self.default_db_path, table_name=\"Nonexistent\")\n def test_case_7(self):\n # Test for negative age 
values\n with self.assertRaises(ValueError):\n f_423(db_name=self.negative_age_db_path, table_name=\"NegativeAge\")", "apis": ["seaborn.histplot", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["sqlite3", "pandas", "seaborn"], "doc": {"description": ["Draw the age distribution of the persons in an SQLite3 table and returns the Axes object of the plot.", "Raises a ValueError if the loaded data contains negative age values."], "note": [], "params": ["db_name (str, optional): The full path to the SQLite3 database file. Defaults to 'test.db'.", "table_name (str, optional): The name of the table to plot from. Defaults to 'People'."], "returns": ["matplotlib.axes._subplots.AxesSubplot: Axes object representing the age distribution plot,", "with x-axis showing age and a default of bins=30, kde=True."], "reqs": ["sqlite3", "pandas", "seaborn"], "raises": [], "example": ["Examples:", ">>> ax = f_423('path/to/test.db', 'People')", ">>> type(ax)", "", ">>> ax = f_423()", ">>> type(ax)", ""]}} -{"task_id": "f_766", "prompt": "import pandas as pd\nimport os\nimport sys\n\ndef f_766(file_path: str, column_name: str) -> pd.DataFrame:\n \"\"\"\n Load a CSV file into a Pandas DataFrame, replace all occurrences of the string '\\n' with the string '
'\n in the specified column, and return the cleaned DataFrame.\n \n Parameters:\n - file_path (str): The path to the CSV file to be read.\n - column_name (str): The name of the column in which to replace occurrences of '\\n' with '
'.\n \n Returns:\n - pd.DataFrame: The cleaned Pandas DataFrame.\n \n Requirements:\n - pandas\n - os\n - sys\n \n Examples:\n >>> df = f_766('data.csv', 'Value')\n >>> print(df['Value'].iloc[0])\n \"some
text\"\n >>> df = f_766('another_data.csv', 'Comments')\n >>> print(df['Comments'].iloc[1])\n \"hello
world\"\n \"\"\"", "canonical_solution": " if not os.path.exists(file_path):\n print(f'File does not exist: {file_path}')\n sys.exit(1)\n\n df = pd.read_csv(file_path)\n \n # Check if the column exists\n if column_name in df.columns:\n df[column_name] = df[column_name].replace({'\\n': '
'}, regex=True)\n else:\n print(f\"Column '{column_name}' does not exist in the DataFrame. No changes were made.\")\n\n return df", "test": "import unittest\nimport pandas as pd\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n os.mkdir('test')\n data = {\n 'ID': [1, 2, 3],\n 'Value': [\"Hello\\nWorld\", \"Python\\nis\\nawesome\", \"No newlines here\"]\n }\n df = pd.DataFrame(data)\n df.to_csv('test/test_data_1.csv', index=False)\n data = {\n 'ID': [1, 2],\n 'Comments': [\"Good\\nMorning\", \"Happy\\nCoding\"]\n }\n df = pd.DataFrame(data)\n df.to_csv('test/test_data_2.csv', index=False)\n data = {\n 'ID': [1, 2],\n 'Text': [\"Line 1\", \"Line 2\\nLine 3\"]\n }\n df = pd.DataFrame(data)\n df.to_csv('test/test_data_3.csv', index=False)\n def tearDown(self):\n os.remove('test/test_data_1.csv')\n os.remove('test/test_data_2.csv')\n os.remove('test/test_data_3.csv')\n os.rmdir('test')\n def test_case_1(self):\n df = f_766('test/test_data_1.csv', 'Value')\n self.assertEqual(df['Value'].iloc[0], \"Hello
World\")\n self.assertEqual(df['Value'].iloc[1], \"Python
is
awesome\")\n self.assertEqual(df['Value'].iloc[2], \"No newlines here\")\n \n def test_case_2(self):\n df = f_766('test/test_data_2.csv', 'Comments')\n self.assertEqual(df['Comments'].iloc[0], \"Good
Morning\")\n self.assertEqual(df['Comments'].iloc[1], \"Happy
Coding\")\n \n def test_case_3(self):\n df = f_766('test/test_data_3.csv', 'Text')\n self.assertEqual(df['Text'].iloc[0], \"Line 1\")\n self.assertEqual(df['Text'].iloc[1], \"Line 2
Line 3\")\n \n def test_case_4(self):\n df1 = f_766('test/test_data_1.csv', 'Value')\n df2 = f_766('test/test_data_1.csv', '')\n self.assertEqual(df1['Value'].iloc[0], \"Hello
World\")\n self.assertEqual(df2['Value'].iloc[0], \"Hello\\nWorld\")\n \n def test_case_5(self):\n df1 = f_766('test/test_data_1.csv', 'Value')\n df2 = f_766('test/test_data_1.csv', 'NonExistentColumn')\n self.assertEqual(df1['Value'].iloc[0], \"Hello
World\")\n self.assertEqual(df2['Value'].iloc[0], \"Hello\\nWorld\")", "apis": ["os.path.exists", "pandas.read_csv", "os.path", "sys.exit", "pandas.DataFrame"], "libs": ["pandas", "os", "sys"], "doc": {"description": ["Load a CSV file into a Pandas DataFrame, replace all occurrences of the string '\\n' with the string '
'", "in the specified column, and return the cleaned DataFrame."], "note": [], "params": ["file_path (str): The path to the CSV file to be read.", "column_name (str): The name of the column in which to replace occurrences of '\\n' with '
'."], "returns": ["pd.DataFrame: The cleaned Pandas DataFrame."], "reqs": ["pandas", "os", "sys"], "raises": [], "example": ["Examples:", ">>> df = f_766('data.csv', 'Value')", ">>> print(df['Value'].iloc[0])", "\"some
text\"", ">>> df = f_766('another_data.csv', 'Comments')", ">>> print(df['Comments'].iloc[1])", "\"hello
world\""]}} -{"task_id": "f_386", "prompt": "from datetime import datetime\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_386(start_time, end_time, step, amplitude, period, seed=0):\n \"\"\"\n Generate a time series with a given seasonality from the start time to the end time\n with a given step, and plot the time series with the seasonality.\n\n Parameters:\n - start_time (int): The start epoch time in milliseconds.\n = end_time (int): The end epoch time in milliseconds.\n - step (int): The step in milliseconds between each data point. Must be at least 1.\n - amplitude (float): The amplitude of the seasonality.\n - period (int): The period of the seasonality in milliseconds. Must be at least 0.\n - seed (int): Random seed for reproducibility. Defaults to 0.\n\n Returns:\n plt.Axes: A plot of the generated 'Time Series with Seasonality',\n with 'Timestamp' on x-axis and 'Value' on y-axis.\n\n Requirements:\n - datetime.datetime\n - pandas\n - numpy\n\n Example:\n >>> ax = f_386(0, 10000, 100, 1, 1000)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n\n if period <= 0 or step < 1:\n raise ValueError(\"Invalid input values\")\n\n COLUMNS = [\"Timestamp\", \"Value\"]\n\n timestamps = np.arange(start_time, end_time, step)\n df = pd.DataFrame(columns=COLUMNS)\n\n if amplitude == 0:\n values = [0] * len(timestamps)\n else:\n values = np.random.normal(size=len(timestamps))\n\n data = []\n for i, ts in enumerate(timestamps):\n dt = datetime.fromtimestamp(ts / 1000).strftime(\"%Y-%m-%d %H:%M:%S.%f\")\n value = values[i] + amplitude * np.sin(2 * np.pi * ts / period)\n 
data.append([dt, value])\n\n df = pd.DataFrame(data, columns=COLUMNS)\n\n ax = df.plot(x=\"Timestamp\", y=\"Value\", title=\"Time Series with Seasonality\")\n ax.set_ylabel(\"Value\")\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic properties\n test_cases = [\n (0, 10000, 100, 1, 1000),\n (0, 100000, 1000, 2, 5000),\n (0, 10000, 100, 0.5, 1000),\n (0, 10000, 100, 1, 500),\n (0, 10000, 500, 1, 1000),\n ]\n for start_time, end_time, step, amplitude, period in test_cases:\n with self.subTest(\n start_time=start_time,\n end_time=end_time,\n step=step,\n amplitude=amplitude,\n period=period,\n ):\n ax = f_386(start_time, end_time, step, amplitude, period)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Time Series with Seasonality\")\n self.assertEqual(ax.get_xlabel(), \"Timestamp\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n def test_case_2(self):\n # Test large step\n # Plot should still behave as expected even when step > (end_time - start_time)\n ax = f_386(0, 10000, 200000, 1, 1000)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Time Series with Seasonality\")\n self.assertEqual(ax.get_xlabel(), \"Timestamp\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n def test_case_3(self):\n # Test handling invalid input types - period\n with self.assertRaises(ValueError):\n f_386(0, 10000, 100, 1, 0)\n with self.assertRaises(ValueError):\n f_386(0, 10000, 100, 1, -1)\n def test_case_4(self):\n # Test handling invalid input types - step\n with self.assertRaises(ValueError):\n f_386(0, 10000, -100, 1, 1000)\n with self.assertRaises(ValueError):\n f_386(0, 10000, 0, 1, 1000)\n def test_case_5(self):\n # Test plot data integrity\n ax = f_386(0, 10000, 100, 1, 1000)\n xy_data = ax.get_lines()[0].get_xydata()\n expected_length = (10000 - 0) // 100\n self.assertEqual(len(xy_data), expected_length)\n def 
test_case_6(self):\n # Test random seed\n ax1 = f_386(0, 10000, 100, 1, 1000, seed=42)\n xy_data1 = ax1.get_lines()[0].get_xydata()\n ax2 = f_386(0, 10000, 100, 1, 1000, seed=42)\n xy_data2 = ax2.get_lines()[0].get_xydata()\n ax3 = f_386(0, 10000, 100, 1, 1000, seed=43)\n xy_data3 = ax3.get_lines()[0].get_xydata()\n self.assertTrue(\n np.array_equal(xy_data1, xy_data2),\n \"Results should be the same with the same seed\",\n )\n self.assertFalse(\n np.array_equal(xy_data1, xy_data3),\n \"Results should be different with different seeds\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.arange", "numpy.random", "numpy.sin", "numpy.pi", "numpy.random.seed", "datetime.datetime.fromtimestamp", "numpy.random.normal", "pandas.DataFrame"], "libs": ["numpy", "pandas", "datetime"], "doc": {"description": ["Generate a time series with a given seasonality from the start time to the end time", "with a given step, and plot the time series with the seasonality."], "note": [], "params": ["start_time (int): The start epoch time in milliseconds.", "= end_time (int): The end epoch time in milliseconds.", "step (int): The step in milliseconds between each data point. Must be at least 1.", "amplitude (float): The amplitude of the seasonality.", "period (int): The period of the seasonality in milliseconds. Must be at least 0.", "seed (int): Random seed for reproducibility. 
Defaults to 0."], "returns": ["plt.Axes: A plot of the generated 'Time Series with Seasonality',", "with 'Timestamp' on x-axis and 'Value' on y-axis."], "reqs": ["datetime.datetime", "pandas", "numpy"], "raises": [], "example": [">>> ax = f_386(0, 10000, 100, 1, 1000)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]"]}} -{"task_id": "f_907", "prompt": "from matplotlib import pyplot as plt\nfrom sklearn.decomposition import PCA\n\n\ndef f_907(arr):\n \"\"\"\n Performs Principal Component Analysis (PCA) on the sum of rows of a 2D numpy array and plots the explained variance ratio.\n\n Note:\n - The title of the plot is set to \"Explained Variance Ratio of Principal Components\".\n\n Parameters:\n - arr (numpy.ndarray): A 2D numpy array. 
The input data for PCA.\n\n Returns:\n - ax (matplotlib.axes.Axes): An Axes object from matplotlib.\n\n Requirements:\n - scikit-learn\n - matplotlib\n\n Notes:\n - The function assumes that 'arr' is a valid 2D numpy array.\n - Only the first principal component is considered in this analysis.\n - The plot illustrates the proportion of the dataset's variance that lies along the axis of this first principal component.\n \n Example:\n >>> import numpy as np\n >>> arr = np.array([[i+j for i in range(3)] for j in range(5)])\n >>> axes = f_907(arr)\n >>> axes.get_title()\n 'Explained Variance Ratio of Principal Components'\n \"\"\"", "canonical_solution": " row_sums = arr.sum(axis=1)\n pca = PCA(n_components=1)\n pca.fit(row_sums.reshape(-1, 1))\n\n # Plotting (requires matplotlib and sklearn)\n\n _, ax = plt.subplots()\n ax.bar([0], pca.explained_variance_ratio_)\n ax.set_title(\"Explained Variance Ratio of Principal Components\")\n ax.set_xticks([0])\n ax.set_xticklabels([\"PC1\"])\n\n return ax", "test": "import unittest\nimport numpy as np\nfrom sklearn.decomposition import PCA\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for function f_907.\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test basic functionality of f_907.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n result = f_907(arr)\n self.assertIsInstance(result, plt.Axes)\n def test_plot_title_verification(self):\n \"\"\"Test that the plot title is correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n result = f_907(arr)\n self.assertEqual(\n result.get_title(), \"Explained Variance Ratio of Principal Components\"\n )\n def test_bar_count_verification(self):\n \"\"\"Test that the number of bars is correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n result = f_907(arr)\n n_components = min(2, arr.sum(axis=1).reshape(-1, 1).shape[1])\n self.assertEqual(len(result.patches), 
n_components)\n def test_variance_ratios_verification(self):\n \"\"\"Test that the variance ratios are correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n row_sums = arr.sum(axis=1)\n n_components = min(2, row_sums.reshape(-1, 1).shape[1])\n pca = PCA(n_components=n_components)\n pca.fit(row_sums.reshape(-1, 1))\n result = f_907(arr)\n for bar, variance_ratio in zip(result.patches, pca.explained_variance_ratio_):\n self.assertAlmostEqual(bar.get_height(), variance_ratio)\n def test_empty_input(self):\n \"\"\"Test that an empty input raises a ValueError.\"\"\"\n arr = np.array([])\n with self.assertRaises(ValueError):\n f_907(arr)", "apis": ["matplotlib.pyplot.subplots", "sklearn.decomposition.PCA"], "libs": ["matplotlib", "sklearn"], "doc": {"description": ["Performs Principal Component Analysis (PCA) on the sum of rows of a 2D numpy array and plots the explained variance ratio.", "Notes:", "- The function assumes that 'arr' is a valid 2D numpy array.", "- Only the first principal component is considered in this analysis.", "- The plot illustrates the proportion of the dataset's variance that lies along the axis of this first principal component."], "note": ["The title of the plot is set to \"Explained Variance Ratio of Principal Components\"."], "params": ["arr (numpy.ndarray): A 2D numpy array. 
The input data for PCA."], "returns": ["ax (matplotlib.axes.Axes): An Axes object from matplotlib."], "reqs": ["scikit-learn", "matplotlib"], "raises": [], "example": [">>> import numpy as np", ">>> arr = np.array([[i+j for i in range(3)] for j in range(5)])", ">>> axes = f_907(arr)", ">>> axes.get_title()", "'Explained Variance Ratio of Principal Components'"]}} -{"task_id": "f_815", "prompt": "import pathlib\nimport os\n\n\ndef f_815(path: str, delimiter: str = os.path.sep) -> list:\n \"\"\"\n Validates that a given file path does not contain invalid characters for file paths\n then splits it into path components using a specified delimiter.\n\n Parameters:\n - path (str): The file path to split. If empty, the function returns an empty list.\n - delimiter (str): The delimiter to use for splitting the path.\n Defaults to the system's path separator (os.path.sep).\n\n Returns:\n - list: A list of the path components if the path is valid;\n otherwise, an empty list if the path contains invalid characters.\n\n Raises:\n - ValueError: If the path contains invalid characters.\n\n Requirements:\n - pathlib\n - os\n\n Notes:\n - Backslashes ('\\\\') are internally converted to forward slashes ('/') before processing.\n - This function treats '<', '>', ':', '\"', '|', '?', '*' as invalid characters in paths.\n\n Examples:\n >>> f_815('Docs/src/Scripts/temp', '/')\n ['Docs', 'src', 'Scripts', 'temp']\n >>> f_815(r'Docs\\\\src\\\\Scripts\\\\temp', '\\\\\\\\')\n ['Docs', 'src', 'Scripts', 'temp']\n \"\"\"", "canonical_solution": "\n if not path:\n return []\n\n path = path.replace(\"\\\\\", \"/\")\n\n path_obj = pathlib.Path(path)\n\n invalid_chars = set('<>:\"|?*')\n if any(\n set(str(component)).intersection(invalid_chars) for component in path_obj.parts\n ):\n return []\n\n return [\n component\n for component in path_obj.parts\n if component and component != delimiter\n ]", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing 
a standard UNIX-like path with '/' delimiter\n self.assertEqual(\n f_815(\"Docs/src/Scripts/temp\", \"/\"),\n [\"Docs\", \"src\", \"Scripts\", \"temp\"],\n )\n def test_case_2(self):\n # Testing a standard Windows-like path with '\\' delimiter\n self.assertEqual(\n f_815(\"Docs\\\\src\\\\Scripts\\\\temp\", \"\\\\\"),\n [\"Docs\", \"src\", \"Scripts\", \"temp\"],\n )\n def test_case_3(self):\n # Testing an empty path string\n self.assertEqual(f_815(\"\", \"/\"), [])\n def test_case_4(self):\n # Testing a path with invalid characters\n self.assertEqual(f_815(\"Docs/src/Scripts|temp\", \"/\"), [])\n def test_case_5(self):\n # Testing a path with a different delimiter\n self.assertEqual(f_815(\"Docs|src|Scripts|temp\", \"|\"), [])\n def test_case_6(self):\n # Handle leading and trailing delimiters\n self.assertEqual(f_815(\"/Docs/src/Scripts/\", \"/\"), [\"Docs\", \"src\", \"Scripts\"])\n def test_case_7(self):\n # Test mixed delimiters given expected conversion\n self.assertEqual(\n f_815(\"Docs/src\\\\Scripts/temp\", \"\\\\\"), [\"Docs\", \"src\", \"Scripts\", \"temp\"]\n )\n self.assertEqual(\n f_815(\"Docs/src\\\\Scripts/temp\", \"/\"), [\"Docs\", \"src\", \"Scripts\", \"temp\"]\n )", "apis": ["pathlib.Path", "os.path"], "libs": ["os", "pathlib"], "doc": {"description": ["Validates that a given file path does not contain invalid characters for file paths", "then splits it into path components using a specified delimiter.", "Notes:", "- Backslashes ('\\\\') are internally converted to forward slashes ('/') before processing.", "- This function treats '<', '>', ':', '\"', '|', '?', '*' as invalid characters in paths."], "note": [], "params": ["path (str): The file path to split. 
If empty, the function returns an empty list.", "delimiter (str): The delimiter to use for splitting the path.", "Defaults to the system's path separator (os.path.sep)."], "returns": ["list: A list of the path components if the path is valid;", "otherwise, an empty list if the path contains invalid characters."], "reqs": ["pathlib", "os"], "raises": ["ValueError: If the path contains invalid characters."], "example": ["Examples:", ">>> f_815('Docs/src/Scripts/temp', '/')", "['Docs', 'src', 'Scripts', 'temp']", ">>> f_815(r'Docs\\\\src\\\\Scripts\\\\temp', '\\\\\\\\')", "['Docs', 'src', 'Scripts', 'temp']"]}} -{"task_id": "f_876", "prompt": "import itertools\nimport string\nimport pandas as pd\n\n\ndef f_876():\n \"\"\"\n Generate all possible 3-letter combinations of the alphabet, save them in a pandas DataFrame,\n and draw a histogram of the frequency of the first letters in these combinations.\n\n This function uses itertools.product to create all possible combinations of three letters.\n It then creates a DataFrame from these combinations and plots a histogram to show the frequency\n of each letter appearing as the first letter in these combinations.\n\n Parameters:\n - None\n\n Returns:\n tuple: A tuple containing:\n - DataFrame: A pandas DataFrame with all 3-letter combinations.\n - Axes: A matplotlib Axes object representing the histogram plot.\n\n Requirements:\n - itertools\n - string\n - pandas\n\n Example:\n >>> df, ax = f_876()\n >>> print(df.head())\n a b c\n 0 a a a\n 1 a a b\n 2 a a c\n 3 a a d\n 4 a a e\n \"\"\"", "canonical_solution": " LETTERS = list(string.ascii_lowercase)\n combinations = list(itertools.product(LETTERS, repeat=3))\n df = pd.DataFrame(combinations, columns=[\"a\", \"b\", \"c\"])\n\n # Getting value counts and ensuring the correct order of letters\n value_counts = df[\"a\"].value_counts().reindex(LETTERS, fill_value=0)\n\n # Plotting the histogram with the correct order\n ax = value_counts.plot(kind=\"bar\")\n\n return df, ax", 
"test": "import unittest\nimport itertools\nimport string\nimport matplotlib.pyplot as plt\nLETTERS = list(string.ascii_lowercase)\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_876\"\"\"\n def test_dataframe_shape(self):\n \"\"\"\n Test if the DataFrame has the correct shape (17576 rows, 3 columns)\n \"\"\"\n df, _ = f_876()\n self.assertEqual(df.shape, (17576, 3))\n def test_dataframe_columns(self):\n \"\"\"\n Test if the DataFrame has the correct column names (a, b, c)\n \"\"\"\n df, _ = f_876()\n self.assertListEqual(list(df.columns), [\"a\", \"b\", \"c\"])\n def test_histogram_plot(self):\n \"\"\"\n Test if the histogram plot is an instance of matplotlib Axes\n \"\"\"\n _, ax = f_876()\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_first_column_values(self):\n \"\"\"\n Test if the first column of the DataFrame contains only lowercase letters\n \"\"\"\n df, _ = f_876()\n self.assertTrue(all(letter in string.ascii_lowercase for letter in df[\"a\"]))\n def test_no_empty_values(self):\n \"\"\"\n Test if there are no empty values in the DataFrame\n \"\"\"\n df, _ = f_876()\n self.assertFalse(df.isnull().values.any())\n def tearDown(self):\n plt.close()", "apis": ["string.ascii_lowercase", "pandas.DataFrame", "itertools.product"], "libs": ["string", "pandas", "itertools"], "doc": {"description": ["Generate all possible 3-letter combinations of the alphabet, save them in a pandas DataFrame,", "and draw a histogram of the frequency of the first letters in these combinations.", "This function uses itertools.product to create all possible combinations of three letters.", "It then creates a DataFrame from these combinations and plots a histogram to show the frequency", "of each letter appearing as the first letter in these combinations."], "note": [], "params": ["None"], "returns": ["tuple: A tuple containing:", "DataFrame: A pandas DataFrame with all 3-letter combinations.", "Axes: A matplotlib Axes object representing the histogram 
plot."], "reqs": ["itertools", "string", "pandas"], "raises": [], "example": [">>> df, ax = f_876()", ">>> print(df.head())", "a b c", "0 a a a", "1 a a b", "2 a a c", "3 a a d", "4 a a e"]}} -{"task_id": "f_586", "prompt": "import pandas as pd\nfrom sklearn.linear_model import LinearRegression\n\ndef f_586(df, target):\n \"\"\"\n Perform a linear regression analysis on a given DataFrame.\n \n Parameters:\n - df (pd.DataFrame): The pandas DataFrame.\n - target (str): The target variable.\n \n Returns:\n - score (float): The R-squared score of the model.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> import numpy as np\n >>> np.random.seed(42)\n >>> df = pd.DataFrame({'feature': np.random.rand(100), 'target': np.random.rand(100)}) # Explicitly using pd\n >>> r_squared = f_586(df, 'target')\n >>> print(r_squared)\n 0.0011582111228732872\n \"\"\"", "canonical_solution": " X = pd.DataFrame.drop(df, target, axis=1) \n y = pd.Series(df[target]) \n \n model = LinearRegression()\n model.fit(X, y)\n\n return model.score(X, y)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame([[0, 1, 2], [3, 4, 5], [6, 7, 8]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertEqual(r_squared, 1.0)\n \n def test_case_2(self):\n df = pd.DataFrame([[-1, 1, 2], [3, 4, 5], [6, 7, 8]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertEqual(r_squared, 1.0)\n \n def test_case_3(self):\n df = pd.DataFrame([[0, 0, 0], [1, 1, 1], [2, 2, 2]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertEqual(r_squared, 1.0)\n def test_case_4(self):\n df = pd.DataFrame([[0, 0, 9], [1, 1, 35], [2, 2, 78]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertFalse(r_squared == 1.0)\n def test_case_5(self):\n df = pd.DataFrame([[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], columns = ['x', 'y', 'z', 'w'])\n r_squared = f_586(df, 'w')\n self.assertEqual(r_squared, 1.0)", "apis": 
["pandas.DataFrame.drop", "pandas.DataFrame", "sklearn.linear_model.LinearRegression", "pandas.Series"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform a linear regression analysis on a given DataFrame."], "note": [], "params": ["df (pd.DataFrame): The pandas DataFrame.", "target (str): The target variable."], "returns": ["score (float): The R-squared score of the model."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(42)", ">>> df = pd.DataFrame({'feature': np.random.rand(100), 'target': np.random.rand(100)}) # Explicitly using pd", ">>> r_squared = f_586(df, 'target')", ">>> print(r_squared)", "0.0011582111228732872"]}} +{"task_id": "f_844", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_844(csv_file_path: str):\n \"\"\"\n This function reads data from a CSV file, normalizes a specific column named 'column1', and then plots the normalized data.\n\n - The title is created using Python's string formatting, aligning 'Plot Title' and 'Normalized Column 1' on either side of a \n colon, each padded to 20 characters.\n - Similarly, the x-label is formatted with 'Index' and 'Normalized Value' on either side of a colon, \n each padded to 20 characters.\n - The y-label is set in the same manner, with 'Frequency' and 'Normalized Value' on either side of a colon.\n\n Parameters:\n - csv_file_path (str): Path to the CSV file. 
The file must contain a column named 'column1'.\n\n Returns:\n - The matplotlib.axes.Axes object with the plot of the normalized data.\n\n Requirements:\n - pandas\n - matplotlib\n\n Example:\n >>> ax = f_844('data.csv')\n >>> ax.get_title()\n 'Plot Title : Normalized Column 1'\n \"\"\"", "canonical_solution": " df = pd.read_csv(csv_file_path)\n mean = df[\"column1\"].mean()\n std = df[\"column1\"].std()\n df[\"column1_normalized\"] = (df[\"column1\"] - mean) / std\n\n # Creating a figure and axes\n _, ax = plt.subplots()\n # Plotting on the created axes\n ax.plot(df[\"column1_normalized\"])\n title = \"%*s : %*s\" % (20, \"Plot Title\", 20, \"Normalized Column 1\")\n xlabel = \"%*s : %*s\" % (20, \"Index\", 20, \"Normalized Value\")\n ylabel = \"%*s : %*s\" % (20, \"Frequency\", 20, \"Normalized Value\")\n ax.set_title(title)\n ax.set_xlabel(xlabel)\n ax.set_ylabel(ylabel)\n\n # Return the axes object for further manipulation\n return ax", "test": "import unittest\nfrom unittest.mock import patch\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_844 function.\"\"\"\n @patch(\"pandas.read_csv\")\n def test_title_format(self, mock_read_csv):\n \"\"\"Test that the function returns the correct title.\"\"\"\n # Mocking the DataFrame\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n expected_title = \" Plot Title : Normalized Column 1\"\n self.assertEqual(ax.get_title(), expected_title)\n @patch(\"pandas.read_csv\")\n def test_xlabel_format(self, mock_read_csv):\n \"\"\"Test that the function returns the correct xlabel.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n expected_xlabel = \" Index : Normalized Value\"\n self.assertEqual(ax.get_xlabel(), expected_xlabel)\n @patch(\"pandas.read_csv\")\n def test_ylabel_format(self, 
mock_read_csv):\n \"\"\"Test that the function returns the correct ylabel.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n expected_ylabel = \" Frequency : Normalized Value\"\n self.assertEqual(ax.get_ylabel(), expected_ylabel)\n @patch(\"pandas.read_csv\")\n def test_data_points_length(self, mock_read_csv):\n \"\"\"Test that the function returns the correct number of data points.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n line = ax.get_lines()[0]\n self.assertEqual(len(line.get_data()[1]), 10)\n @patch(\"pandas.read_csv\")\n def test_data_points_range(self, mock_read_csv):\n \"\"\"Test that the function returns the correct data points.\"\"\"\n mock_data = pd.DataFrame({\"column1\": np.random.rand(10)})\n mock_read_csv.return_value = mock_data\n ax = f_844(\"dummy_path\")\n line = ax.get_lines()[0]\n data_points = line.get_data()[1]\n self.assertTrue(all(-3 <= point <= 3 for point in data_points))\n def tearDown(self):\n plt.clf()", "apis": ["pandas.read_csv", "matplotlib.pyplot.subplots"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["This function reads data from a CSV file, normalizes a specific column named 'column1', and then plots the normalized data.", "- The title is created using Python's string formatting, aligning 'Plot Title' and 'Normalized Column 1' on either side of a", "colon, each padded to 20 characters.", "- Similarly, the x-label is formatted with 'Index' and 'Normalized Value' on either side of a colon,", "each padded to 20 characters.", "- The y-label is set in the same manner, with 'Frequency' and 'Normalized Value' on either side of a colon."], "note": [], "params": ["csv_file_path (str): Path to the CSV file. 
The file must contain a column named 'column1'."], "returns": ["The matplotlib.axes.Axes object with the plot of the normalized data."], "reqs": ["pandas", "matplotlib"], "raises": [], "example": [">>> ax = f_844('data.csv')", ">>> ax.get_title()", "'Plot Title : Normalized Column 1'"]}} +{"task_id": "f_348", "prompt": "import numpy as np\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_348(\n P: np.ndarray,\n T: np.ndarray,\n n_clusters: int = 3,\n random_state: int = 0,\n n_init: int = 10,\n) -> (np.ndarray, plt.Axes):\n \"\"\"\n Calculate the product of a matrix 'P' and a 3D tensor 'T', flatten the result,\n apply KMeans clustering to the flattened data, and visualize it.\n\n Parameters:\n P (numpy.ndarray): The input matrix.\n T (numpy.ndarray): The input tensor with shape (3, 3, 3).\n n_clusters (int): The number of clusters for KMeans clustering. Default is 3.\n random_state (int): The random state for KMeans clustering. Default is 0.\n n_init (int): Number of time the k-means algorithm will be run with different centroid seeds. 
Default is 10.\n\n Returns:\n cluster_result (numpy.ndarray): The result of KMeans clustering.\n ax (matplotlib.axes.Axes): The visualization of the KMeans clustering.\n\n Requirements:\n - numpy\n - sklearn\n - matplotlib\n\n Example:\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])\n >>> T = np.random.rand(3, 3, 3)\n >>> cluster_result, ax = f_348(P, T, n_clusters=3, random_state=0, n_init=10)\n >>> type(cluster_result)\n \n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n tensor_shape = (3, 3, 3)\n if not T.shape == tensor_shape:\n raise ValueError(\"Provided tensor does not match the expected shape.\")\n\n # Using numpy for tensor product\n result = np.tensordot(P, T, axes=[1, 1]).swapaxes(0, 1)\n flattened_result = result.reshape(-1, tensor_shape[2]) # Flattening the result\n kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)\n cluster_result = kmeans.fit_predict(flattened_result)\n fig, ax = plt.subplots()\n ax.scatter(flattened_result[:, 0], flattened_result[:, 1], c=cluster_result)\n ax.set_title(\"KMeans Clustering Visualization\")\n return cluster_result, ax", "test": "import unittest\nimport numpy as np\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.random_seed = 0\n np.random.seed(self.random_seed)\n self.P = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])\n self.T = np.random.rand(3, 3, 3)\n def test_case_1(self):\n # Test with easy example\n P = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])\n T = np.array(\n [\n [[1, 0, 0], [0, 1, 1], [0, 0, 1]],\n [[1, 1, 0], [0, 1, 0], [0, 0, 1]],\n [[1, 0, 1], [0, 1, 0], [1, 0, 1]],\n ]\n )\n cluster_result, _ = f_348(P, T, n_clusters=3)\n self.assertEqual(len(np.unique(cluster_result)), 3)\n def test_case_2(self):\n # Test correct cluster centers (against manual calculated results)\n n_clusters = 3\n n_init = 10\n possible_labels = list(range(n_clusters))\n 
result, _ = f_348(self.P, self.T, random_state=self.random_seed, n_init=n_init)\n manual_results = KMeans(\n n_clusters=n_clusters, random_state=self.random_seed, n_init=n_init\n ).fit(\n np.tensordot(self.P, self.T, axes=[1, 1])\n .swapaxes(0, 1)\n .reshape(-1, n_clusters)\n )\n self.assertTrue((result == manual_results.labels_).all())\n self.assertEqual(result.shape, (self.P.shape[0] * n_clusters,))\n self.assertEqual(\n manual_results.cluster_centers_.shape, (n_clusters, n_clusters)\n )\n self.assertTrue((pred in possible_labels for pred in result))\n def test_case_3(self):\n # Test visualizations\n _, ax = f_348(self.P, self.T)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"KMeans Clustering Visualization\")\n num_data_points = len(ax.collections[0].get_offsets())\n self.assertEqual(num_data_points, self.P.shape[0] * 3)\n def test_case_4(self):\n # Test changing number of clusters\n for n_clusters in [1, 3, 5]:\n cluster_result, _ = f_348(self.P, self.T, n_clusters=n_clusters)\n unique_clusters = np.unique(cluster_result)\n self.assertEqual(len(unique_clusters), n_clusters)\n def test_case_5(self):\n # Function should fail with incompatible input - n_cluster and n_init\n for invalid in [-1, 0, \"invalid\"]:\n with self.assertRaises(Exception):\n f_348(self.P, self.T, n_clusters=invalid)\n def test_case_6(self):\n # Function should fail with incompatible input - shapes\n with self.assertRaises(ValueError):\n f_348(np.random.randn(2, 2), self.T)\n with self.assertRaises(ValueError):\n f_348(self.P, np.random.randn(2, 2))\n def test_case_7(self):\n # Function should fail with incompatible input - random_state\n with self.assertRaises(ValueError):\n f_348(self.P, self.T, random_state=\"invalid\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.tensordot", "sklearn.cluster.KMeans", "numpy.ndarray", "matplotlib.pyplot.subplots", "matplotlib.pyplot.Axes"], "libs": ["matplotlib", "numpy", "sklearn"], "doc": {"description": 
["Calculate the product of a matrix 'P' and a 3D tensor 'T', flatten the result,", "apply KMeans clustering to the flattened data, and visualize it."], "note": [], "params": ["P (numpy.ndarray): The input matrix.", "T (numpy.ndarray): The input tensor with shape (3, 3, 3).", "n_clusters (int): The number of clusters for KMeans clustering. Default is 3.", "random_state (int): The random state for KMeans clustering. Default is 0.", "n_init (int): Number of time the k-means algorithm will be run with different centroid seeds. Default is 10."], "returns": ["cluster_result (numpy.ndarray): The result of KMeans clustering.", "ax (matplotlib.axes.Axes): The visualization of the KMeans clustering."], "reqs": ["numpy", "sklearn", "matplotlib"], "raises": [], "example": [">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])", ">>> T = np.random.rand(3, 3, 3)", ">>> cluster_result, ax = f_348(P, T, n_clusters=3, random_state=0, n_init=10)", ">>> type(cluster_result)", "", ">>> type(ax)", ""]}} +{"task_id": "f_927", "prompt": "import pandas as pd\nfrom sklearn.preprocessing import MinMaxScaler\nimport matplotlib.pyplot as plt\n\n\ndef f_927(data):\n \"\"\"\n Processes a dataset containing salary information and experience, then plots normalized salary against experience.\n The function executes the following steps:\n 1. Input Validation: Checks if the input data dictionary contains the required keys ('Salary_String' and 'Experience').\n Raises a ValueError if the necessary keys are missing.\n 2. DataFrame Conversion: Converts the input data into a pandas DataFrame for easier manipulation.\n 3. Empty Data Handling: Checks if the DataFrame is empty. If so, it returns a default Axes instance with\n labeled axes but no data plotted. This handles cases where there is no data to plot.\n 4. 
Salary Conversion: Converts 'Salary_String' values from comma-separated strings to floats.\n It handles potential conversion errors by catching ValueErrors and re-raising them with a custom message.\n 5. Salary Normalization: Applies Min-Max scaling to normalize the salary values. This step transforms\n the salary data into a range between 0 and 1, allowing for easier comparison and visualization.\n 6. Data Plotting: Creates a scatter plot of the normalized salary against experience using matplotlib.\n The plot's axes are labeled accordingly.\n\n Parameters:\n - data (dict): A dictionary with two keys: 'Salary_String' and 'Experience'.\n 'Salary_String' should contain salary values as comma-separated strings.\n 'Experience' should contain corresponding experience values as integers.\n\n Returns:\n - matplotlib.axes.Axes: An Axes instance with the plotted scatter plot.\n\n Raises:\n - ValueError: If the input dictionary does not contain the required keys or if data conversion from string to float fails.\n\n Requirements:\n - pandas\n - sklearn\n - matplotlib\n\n Example:\n >>> ax = f_927({'Salary_String': ['1,000', '2,000', '3,000'], 'Experience': [1, 2, 3]})\n >>> print(ax.get_title())\n Normalized Salary vs Experience\n \"\"\"", "canonical_solution": " # Validate input data\n if not all(key in data for key in [\"Salary_String\", \"Experience\"]):\n raise ValueError(\n \"Input data must contain 'Salary_String' and 'Experience' keys.\"\n )\n\n # Convert data to DataFrame\n df = pd.DataFrame(data)\n\n # Check if the data is empty\n if df.empty:\n # Handle empty data case (e.g., return a default Axes instance or raise an error)\n _, ax = plt.subplots()\n ax.set_title(\"Normalized Salary vs Experience\")\n ax.set_xlabel(\"Experience\")\n ax.set_ylabel(\"Normalized Salary\")\n return ax\n\n # Convert Salary_String to float and handle potential conversion errors\n try:\n df[\"Salary_Float\"] = df[\"Salary_String\"].str.replace(\",\", \"\").astype(float)\n except 
ValueError:\n raise ValueError(\"Error converting Salary_String to float.\")\n\n # Normalize the Salary_Float values\n scaler = MinMaxScaler()\n df[\"Normalized_Salary\"] = scaler.fit_transform(df[[\"Salary_Float\"]])\n\n # Plot the data\n _, ax = plt.subplots()\n ax.scatter(df[\"Experience\"], df[\"Normalized_Salary\"])\n ax.set_title(\"Normalized Salary vs Experience\")\n ax.set_xlabel(\"Experience\")\n ax.set_ylabel(\"Normalized Salary\")\n\n return ax", "test": "import unittest\nimport pandas as pd\nfrom matplotlib.axes import Axes\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_927.\"\"\"\n def test_valid_data(self):\n \"\"\"Test with valid data.\"\"\"\n data = {\"Salary_String\": [\"1,000\", \"2,000\", \"3,000\"], \"Experience\": [1, 2, 3]}\n result = f_927(data)\n self.assertIsInstance(result, Axes)\n def test_missing_key(self):\n \"\"\"Test with missing key in input dictionary.\"\"\"\n data = {\"Salary_String\": [\"1,000\", \"2,000\", \"3,000\"]}\n with self.assertRaises(ValueError):\n f_927(data)\n def test_empty_data(self):\n \"\"\"Test with empty data.\"\"\"\n data = {\"Salary_String\": [], \"Experience\": []}\n result = f_927(data)\n self.assertIsInstance(result, Axes)\n def test_invalid_salary_format(self):\n \"\"\"Test with invalid salary format.\"\"\"\n data = {\n \"Salary_String\": [\"1.000\", \"2,000\", \"Three Thousand\"],\n \"Experience\": [1, 2, 3],\n }\n with self.assertRaises(ValueError):\n f_927(data)\n def test_mismatched_lengths(self):\n \"\"\"Test with mismatched lengths of salary and experience arrays.\"\"\"\n data = {\"Salary_String\": [\"1,000\", \"2,000\"], \"Experience\": [1, 2, 3]}\n with self.assertRaises(ValueError):\n f_927(data)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots", "sklearn.preprocessing.MinMaxScaler"], "libs": ["sklearn", "pandas", "matplotlib"], "doc": {"description": ["Processes a dataset containing salary 
information and experience, then plots normalized salary against experience.", "The function executes the following steps:", "1. Input Validation: Checks if the input data dictionary contains the required keys ('Salary_String' and 'Experience').", "Raises a ValueError if the necessary keys are missing.", "2. DataFrame Conversion: Converts the input data into a pandas DataFrame for easier manipulation.", "3. Empty Data Handling: Checks if the DataFrame is empty. If so, it returns a default Axes instance with", "labeled axes but no data plotted. This handles cases where there is no data to plot.", "4. Salary Conversion: Converts 'Salary_String' values from comma-separated strings to floats.", "It handles potential conversion errors by catching ValueErrors and re-raising them with a custom message.", "5. Salary Normalization: Applies Min-Max scaling to normalize the salary values. This step transforms", "the salary data into a range between 0 and 1, allowing for easier comparison and visualization.", "6. 
Data Plotting: Creates a scatter plot of the normalized salary against experience using matplotlib.", "The plot's axes are labeled accordingly."], "note": [], "params": ["data (dict): A dictionary with two keys: 'Salary_String' and 'Experience'.", "'Salary_String' should contain salary values as comma-separated strings.", "'Experience' should contain corresponding experience values as integers."], "returns": ["matplotlib.axes.Axes: An Axes instance with the plotted scatter plot."], "reqs": ["pandas", "sklearn", "matplotlib"], "raises": ["ValueError: If the input dictionary does not contain the required keys or if data conversion from string to float fails."], "example": [">>> ax = f_927({'Salary_String': ['1,000', '2,000', '3,000'], 'Experience': [1, 2, 3]})", ">>> print(ax.get_title())", "Normalized Salary vs Experience"]}} +{"task_id": "f_800", "prompt": "import string\nimport re\n\n\ndef f_800(text: str) -> tuple:\n \"\"\"\n Counts the number of words, characters, and unique characters in a given text.\n\n Parameters:\n - text (str): The input text to be analyzed.\n\n Returns:\n - tuple: A tuple containing three integers: the number of words,\n the number of characters,\n the number of unique characters.\n\n Requirements:\n - string\n - re\n\n Note:\n - This function considers whitespace-separated substrings as words.\n - When counting characters, this function excludes whitespace and special\n characters (i.e. string.punctuation).\n\n Example:\n >>> f_800('Hello, world!')\n (2, 10, 7)\n >>> f_800('Python is awesome! 
')\n (3, 15, 12)\n \"\"\"", "canonical_solution": " words = text.split()\n chars = re.sub(\"\\s\", \"\", re.sub(f\"[{string.punctuation}]\", \"\", text))\n\n return len(words), len(chars), len(set(chars))", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test simple text without any punctuation.\n result = f_800(\"Hello world\")\n self.assertEqual(result, (2, 10, 7))\n def test_case_2(self):\n # Test simple text that includes punctuation.\n result = f_800(\"Hello, world!\")\n self.assertEqual(result, (2, 10, 7))\n def test_case_3(self):\n # Test single word and no punctuation.\n result = f_800(\"Hello\")\n self.assertEqual(result, (1, 5, 4))\n def test_case_4(self):\n # Test single word that includes punctuation.\n result = f_800(\"Hello!\")\n self.assertEqual(result, (1, 5, 4))\n def test_case_5(self):\n # Test empty string.\n result = f_800(\"\")\n self.assertEqual(result, (0, 0, 0))\n def test_case_6(self):\n # Test text with numbers and punctuation.\n result = f_800(\"There are 4 numbers here: 1, 2, 3, and 4.\")\n self.assertEqual(result, (10, 27, 15))\n def test_case_7(self):\n # Test text with only whitespace and punctuation.\n result = f_800(\" , , !\")\n self.assertEqual(result, (3, 0, 0))\n def test_case_8(self):\n # Test text with multiple spaces between words.\n result = f_800(\"Multiple spaces here\")\n self.assertEqual(result, (3, 18, 12))\n def test_case_9(self):\n # Test a long text.\n long_text = \"This is a longer text designed to test the function's ability to handle more complex input, including a variety of characters and spaces.\"\n result = f_800(long_text)\n self.assertEqual(result, (23, 112, 22))", "apis": ["re.sub", "string.punctuation"], "libs": ["re", "string"], "doc": {"description": ["Counts the number of words, characters, and unique characters in a given text."], "note": ["This function considers whitespace-separated substrings as words.", "When counting characters, this function excludes 
whitespace and special", "characters (i.e. string.punctuation)."], "params": ["text (str): The input text to be analyzed."], "returns": ["tuple: A tuple containing three integers: the number of words,", "the number of characters,", "the number of unique characters."], "reqs": ["string", "re"], "raises": [], "example": [">>> f_800('Hello, world!')", "(2, 10, 7)", ">>> f_800('Python is awesome! ')", "(3, 15, 12)"]}} +{"task_id": "f_785", "prompt": "import pandas as pd\nfrom datetime import datetime\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n# Constants\nSTART_DATE = '2016-01-01'\nPERIODS = 13\nFREQ = 'WOM-2FRI'\nCATEGORIES = ['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports']\n\ndef f_785(start_date=START_DATE, periods=PERIODS, freq=FREQ, categories=CATEGORIES):\n \"\"\"\n Create and visualize a sales report for different categories over a period of time.\n \n Functionality:\n - Generates a DataFrame containing sales data for given categories over a time range.\n - Visualizes the sales data using a line plot.\n \n Input:\n - start_date (str): The start date for the report in 'YYYY-MM-DD' format. Default is '2016-01-01'.\n - periods (int): The number of periods for the report. Default is 13.\n - freq (str): The frequency of dates to be generated. Default is 'WOM-2FRI' (WeekOfMonth-2nd Friday).\n - categories (list): List of categories to include in the report. 
Default is ['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports'].\n\n Output:\n - Returns a DataFrame containing the sales data.\n - Returns the Matplotlib Axes object for the plot.\n\n Requirements:\n - pandas\n - datetime\n - matplotlib.pyplot\n - numpy\n\n Example:\n >>> df, ax = f_785(start_date='2020-01-01', periods=5, freq='W-MON', categories=['Electronics', 'Fashion'])\n >>> df\n Date Category Sales\n 0 2020-01-06 Electronics 272\n 1 2020-01-06 Fashion 147\n 2 2020-01-13 Electronics 217\n 3 2020-01-13 Fashion 292\n 4 2020-01-20 Electronics 423\n 5 2020-01-20 Fashion 351\n 6 2020-01-27 Electronics 295\n 7 2020-01-27 Fashion 459\n 8 2020-02-03 Electronics 109\n 9 2020-02-03 Fashion 311\n \"\"\"", "canonical_solution": " np.random.seed(0) # Ensure reproducible sales figures\n date_range = pd.date_range(start=start_date, periods=periods, freq=freq)\n report_data = []\n\n for date in date_range:\n for category in categories:\n sales = np.random.randint(low=100, high=500)\n report_data.append([date, category, sales])\n\n sales_df = pd.DataFrame(report_data, columns=['Date', 'Category', 'Sales'])\n\n fig, ax = plt.subplots(figsize=(12, 8))\n sales_df.pivot(index='Date', columns='Category', values='Sales').plot(ax=ax)\n ax.set_title('Category-wise Sales Trends')\n ax.grid(True)\n \n return sales_df, ax", "test": "import unittest\nimport pandas as pd\n# Unit tests for the f_785 function\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n \"\"\"Test with default parameters.\"\"\"\n df, ax = f_785()\n self.assertIsInstance(df, pd.DataFrame)\n self.assertTrue(all(x in df.columns for x in ['Date', 'Category', 'Sales']))\n self.assertEqual(len(df['Category'].unique()), 5)\n self.assertEqual(ax.get_title(), 'Category-wise Sales Trends')\n def test_case_2(self):\n \"\"\"Test with custom start_date and periods.\"\"\"\n df, _ = f_785(start_date='2021-01-01', periods=7)\n self.assertTrue(df['Date'].min() >= pd.to_datetime('2021-01-01'))\n 
self.assertEqual(df['Date'].nunique(), 7)\n expected_rows = 7 * len(['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports'])\n self.assertEqual(len(df), expected_rows)\n \n def test_case_3(self):\n \"\"\"Test with a different frequency and custom categories.\"\"\"\n df, _ = f_785(freq='W-TUE', categories=['Books', 'Games'])\n self.assertEqual(len(df['Category'].unique()), 2)\n self.assertTrue(all(category in ['Books', 'Games'] for category in df['Category'].unique()))\n def test_case_4(self):\n \"\"\"Test with all parameters customized.\"\"\"\n df, _ = f_785(start_date='2019-06-01', periods=10, freq='W-WED', categories=['Food', 'Clothing'])\n self.assertEqual(len(df['Category'].unique()), 2)\n self.assertTrue(all(category in ['Food', 'Clothing'] for category in df['Category'].unique()))\n def test_case_5(self):\n \"\"\"Test with a single category.\"\"\"\n df, _ = f_785(categories=['Electronics'])\n self.assertTrue(all(df['Category'] == 'Electronics'))\n self.assertEqual(len(df), 13) # Default periods", "apis": ["numpy.random.randint", "pandas.DataFrame", "numpy.random", "pandas.date_range", "matplotlib.pyplot.subplots", "numpy.random.seed"], "libs": ["pandas", "numpy", "matplotlib"], "doc": {"description": ["Create and visualize a sales report for different categories over a period of time.", "Functionality:", "- Generates a DataFrame containing sales data for given categories over a time range.", "- Visualizes the sales data using a line plot.", "Input:", "- start_date (str): The start date for the report in 'YYYY-MM-DD' format. Default is '2016-01-01'.", "- periods (int): The number of periods for the report. Default is 13.", "- freq (str): The frequency of dates to be generated. Default is 'WOM-2FRI' (WeekOfMonth-2nd Friday).", "- categories (list): List of categories to include in the report. 
Default is ['Electronics', 'Fashion', 'Home & Kitchen', 'Automotive', 'Sports'].", "Output:", "- Returns a DataFrame containing the sales data.", "- Returns the Matplotlib Axes object for the plot."], "note": [], "params": [], "returns": [], "reqs": ["pandas", "datetime", "matplotlib.pyplot", "numpy"], "raises": [], "example": [">>> df, ax = f_785(start_date='2020-01-01', periods=5, freq='W-MON', categories=['Electronics', 'Fashion'])", ">>> df", "Date Category Sales", "0 2020-01-06 Electronics 272", "1 2020-01-06 Fashion 147", "2 2020-01-13 Electronics 217", "3 2020-01-13 Fashion 292", "4 2020-01-20 Electronics 423", "5 2020-01-20 Fashion 351", "6 2020-01-27 Electronics 295", "7 2020-01-27 Fashion 459", "8 2020-02-03 Electronics 109", "9 2020-02-03 Fashion 311"]}} +{"task_id": "f_585", "prompt": "import pandas as pd\nimport numpy as np\n\ndef f_585(data, cols):\n \"\"\"\n Turn the provided data into a DataFrame and then calculate the correlation matrix of numeric columns.\n \n Parameters:\n - data (list): List of lists with the data, where the length of the inner list equals the number of columns\n - cols (list): List of column names\n \n Returns:\n - correlation_matrix (pd.DataFrame): The correlation matrix.\n\n Requirements:\n - pandas\n - numpy\n \n Example:\n >>> correlation_matrix = f_585([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], ['x', 'y', 'z'])\n >>> print(correlation_matrix)\n x y z\n x 1.000000 0.596040 0.866025\n y 0.596040 1.000000 0.114708\n z 0.866025 0.114708 1.000000\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data, columns=cols)\n \n df_np = np.array(df)\n df = pd.DataFrame(df_np, columns=cols)\n \n correlation_matrix = df.corr()\n return correlation_matrix", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], ['x', 
'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n def test_case_2(self):\n df = pd.DataFrame([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n def test_case_3(self):\n df = pd.DataFrame([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n \n def test_case_4(self):\n df = pd.DataFrame([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))\n def test_case_5(self):\n df = pd.DataFrame([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0], [-7.0, -8.0, -9.0]], columns = ['x', 'y', 'z'])\n correlation_matrix = f_585([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0], [-7.0, -8.0, -9.0]], ['x', 'y', 'z'])\n self.assertTrue(np.allclose(correlation_matrix, df.corr()))", "apis": ["pandas.DataFrame", "numpy.array"], "libs": ["numpy", "pandas"], "doc": {"description": ["Turn the provided data into a DataFrame and then calculate the correlation matrix of numeric columns."], "note": [], "params": ["data (list): List of lists with the data, where the length of the inner list equals the number of columns", "cols (list): List of column names"], "returns": ["correlation_matrix (pd.DataFrame): The correlation matrix."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> correlation_matrix = f_585([[5.1, 3.5, 1.4], [4.9, 3.0, 1.4], [4.7, 3.2, 1.3]], ['x', 'y', 'z'])", ">>> print(correlation_matrix)", "x y z", "x 1.000000 0.596040 0.866025", "y 0.596040 1.000000 0.114708", "z 0.866025 0.114708 1.000000"]}} +{"task_id": "f_341", "prompt": "import string\nimport 
matplotlib.pyplot as plt\n\n\ndef f_341(s):\n \"\"\"\n Calculate the frequency of each letter in a string and return a bar chart of frequencies.\n Results are case-insensitive. If non-string input is provided, function will throw an error.\n\n Parameters:\n s (str): The string to calculate letter frequencies.\n\n Returns:\n tuple: A tuple containing:\n - dict: A dictionary with the frequency of each letter.\n - Axes: The plot of 'Letter Frequencies' with 'Letters' on the x-axis and 'Frequency'\n on the y-axis.\n\n Requirements:\n - string\n - matplotlib.pyplot\n\n Example:\n >>> s = 'This is a test string.'\n >>> freqs, ax = f_341(s)\n >>> freqs\n {'a': 1, 'b': 0, 'c': 0, 'd': 0, 'e': 1, 'f': 0, 'g': 1, 'h': 1, 'i': 3, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 1, 'o': 0, 'p': 0, 'q': 0, 'r': 1, 's': 4, 't': 4, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0}\n >>> type(ax)\n \n \"\"\"", "canonical_solution": "\n if not isinstance(s, str):\n raise TypeError(\"Expected string input\")\n\n LETTERS = string.ascii_lowercase\n\n s = s.lower()\n\n letter_counts = {letter: s.count(letter) for letter in LETTERS}\n\n fig, ax = plt.subplots()\n ax.bar(letter_counts.keys(), letter_counts.values())\n ax.set_xlabel(\"Letters\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(\"Letter Frequencies\")\n\n return letter_counts, ax", "test": "import unittest\nimport string\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with a simple sentence\n s = \"This is a test string.\"\n expected_output = {\n letter: s.lower().count(letter) for letter in string.ascii_lowercase\n }\n result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_2(self):\n # Test with a string having all alphabets\n s = \"abcdefghijklmnopqrstuvwxyz\"\n expected_output = {letter: 1 for letter in string.ascii_lowercase}\n 
result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_3(self):\n # Test with a string having no alphabets\n s = \"1234567890!@#$%^&*()\"\n expected_output = {letter: 0 for letter in string.ascii_lowercase}\n result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_4(self):\n # Test with an empty string\n s = \"\"\n expected_output = {letter: 0 for letter in string.ascii_lowercase}\n result, ax = f_341(s)\n self.assertEqual(result, expected_output)\n self.assertEqual(ax.get_title(), \"Letter Frequencies\")\n self.assertEqual(ax.get_xlabel(), \"Letters\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_5(self):\n # Test error handling\n for invalid in [123, []]:\n with self.assertRaises(Exception):\n f_341(invalid)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "string.ascii_lowercase"], "libs": ["matplotlib", "string"], "doc": {"description": ["Calculate the frequency of each letter in a string and return a bar chart of frequencies.", "Results are case-insensitive. 
If non-string input is provided, function will throw an error."], "note": [], "params": ["s (str): The string to calculate letter frequencies."], "returns": ["tuple: A tuple containing:", "dict: A dictionary with the frequency of each letter.", "Axes: The plot of 'Letter Frequencies' with 'Letters' on the x-axis and 'Frequency'", "on the y-axis."], "reqs": ["string", "matplotlib.pyplot"], "raises": [], "example": [">>> s = 'This is a test string.'", ">>> freqs, ax = f_341(s)", ">>> freqs", "{'a': 1, 'b': 0, 'c': 0, 'd': 0, 'e': 1, 'f': 0, 'g': 1, 'h': 1, 'i': 3, 'j': 0, 'k': 0, 'l': 0, 'm': 0, 'n': 1, 'o': 0, 'p': 0, 'q': 0, 'r': 1, 's': 4, 't': 4, 'u': 0, 'v': 0, 'w': 0, 'x': 0, 'y': 0, 'z': 0}", ">>> type(ax)", ""]}} +{"task_id": "f_829", "prompt": "import json\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport pandas as pd\n\n\ndef f_829(json_data: str, key_path: list):\n \"\"\"\n Extracts and visualizes numerical data from a JSON structure based on a specified path of keys.\n\n Parameters:\n json_data (str): JSON formatted string.\n key_path (list): List of strings representing the nested keys to locate the data within the JSON.\n\n Returns:\n matplotlib.figure.Figure: A matplotlib figure showing a boxplot of the data values.\n\n Raises:\n KeyError: If a specified key is not found.\n ValueError: If no numeric data is found, or the data string is empty or corrupted.\n\n Examples:\n >>> json_data = '{\"level1\":{\"level2\":{\"data\":\"1,2,3,4\"}}}'\n >>> key_path = ['level1', 'level2', 'data']\n >>> fig = f_829(json_data, key_path)\n >>> isinstance(fig, plt.Figure)\n True\n \"\"\"", "canonical_solution": " try:\n data = json.loads(json_data)\n for key in key_path:\n data = data[key]\n values = np.fromstring(data, sep=\",\")\n\n if values.size == 0:\n raise ValueError(\"No numeric data found or empty data string.\")\n df = pd.DataFrame(values, columns=[\"Values\"])\n\n fig, ax = plt.subplots()\n sns.boxplot(data=df, ax=ax)\n return 
fig\n\n except json.decoder.JSONDecodeError as e:\n raise ValueError(f\"Input malformed: {e}\")\n except KeyError as e:\n raise KeyError(f\"Key error occurred: {e}\")\n except ValueError as e:\n raise ValueError(f\"Value error occurred: {e}\")", "test": "import unittest\nimport warnings\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_correct_data_extraction(self):\n \"\"\"Tests correct extraction and visualization from valid JSON data.\"\"\"\n json_data = '{\"level1\":{\"level2\":{\"data\":\"1,2,3,4\"}}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n fig = f_829(json_data, key_path)\n self.assertIsInstance(fig, plt.Figure)\n def test_missing_key_error(self):\n \"\"\"Tests response to missing key in JSON data.\"\"\"\n json_data = '{\"level1\":{}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n with self.assertRaises(KeyError):\n f_829(json_data, key_path)\n def test_corrupted_json(self):\n \"\"\"Tests response to malformed data.\"\"\"\n key_path = [\"level1\", \"level2\", \"data\"]\n for x in [\"{'level1':{}}\", '{\"level1\":{\"level' \"invalid\", \"\"]:\n with self.assertRaises(ValueError):\n f_829(x, key_path)\n def test_empty_data_value_error(self):\n \"\"\"Tests response to empty numeric data.\"\"\"\n json_data = '{\"level1\":{\"level2\":{\"data\":\"\"}}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n with self.assertRaises(ValueError):\n f_829(json_data, key_path)\n def test_non_numeric_data_value_error(self):\n \"\"\"Tests response to non-numeric data.\"\"\"\n json_data = '{\"level1\":{\"level2\":{\"data\":\"a,b,c\"}}}'\n key_path = [\"level1\", \"level2\", \"data\"]\n with warnings.catch_warnings():\n warnings.simplefilter(\"ignore\")\n with self.assertRaises(ValueError):\n f_829(json_data, key_path)", "apis": ["seaborn.boxplot", "json.loads", "numpy.fromstring", "pandas.DataFrame", "matplotlib.pyplot.subplots", "json.decoder"], "libs": ["numpy", "seaborn", "pandas", "json", "matplotlib"], "doc": {"description": 
["Extracts and visualizes numerical data from a JSON structure based on a specified path of keys."], "note": [], "params": ["json_data (str): JSON formatted string.", "key_path (list): List of strings representing the nested keys to locate the data within the JSON."], "returns": ["matplotlib.figure.Figure: A matplotlib figure showing a boxplot of the data values."], "reqs": [], "raises": ["KeyError: If a specified key is not found.", "ValueError: If no numeric data is found, or the data string is empty or corrupted."], "example": ["Examples:", ">>> json_data = '{\"level1\":{\"level2\":{\"data\":\"1,2,3,4\"}}}'", ">>> key_path = ['level1', 'level2', 'data']", ">>> fig = f_829(json_data, key_path)", ">>> isinstance(fig, plt.Figure)", "True"]}} +{"task_id": "f_872", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_872(rows=100, columns=3):\n \"\"\"\n Create a Pandas DataFrame with random alphabets in each cell.\n The DataFrame will have a specified number of rows and columns.\n Each column is named with a string from the list ['a', 'b', 'c', ...]\n depending on the number of columns specified.\n\n Parameters:\n - rows (int, optional): Number of rows in the DataFrame. Defaults to 100.\n - columns (int, optional): Number of columns in the DataFrame. 
Defaults to 3.\n\n Returns:\n DataFrame: A pandas DataFrame with random alphabets.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> np.random.seed(0)\n >>> df = f_872(5, 3)\n >>> print(df)\n a b c\n 0 m p v\n 1 a d d\n 2 h j t\n 3 v s e\n 4 x g y\n >>> df['a'].value_counts()\n a\n m 1\n a 1\n h 1\n v 1\n x 1\n Name: count, dtype: int64\n \"\"\"", "canonical_solution": " column_names = [\n chr(97 + i) for i in range(columns)\n ] # generate column names based on the number of columns\n values = list(\"abcdefghijklmnopqrstuvwxyz\")\n data = np.random.choice(values, size=(rows, columns))\n df = pd.DataFrame(data, columns=column_names)\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Tests case for function `f_872`.\"\"\"\n def test_dataframe_shape_default(self):\n \"\"\"Test if the DataFrame has default shape (100 rows, 3 columns) with default parameters.\"\"\"\n np.random.seed(1)\n df_test = f_872()\n self.assertEqual(df_test.shape, (100, 3))\n def test_dataframe_shape_custom_rows(self):\n \"\"\"Test if the DataFrame has the correct shape when a custom number of rows is specified.\"\"\"\n np.random.seed(2)\n df_test = f_872(50)\n self.assertEqual(df_test.shape, (50, 3))\n def test_dataframe_shape_custom_columns(self):\n \"\"\"Test if the DataFrame has the correct shape with a custom number of columns.\"\"\"\n np.random.seed(3)\n df_test = f_872(50, 5)\n self.assertEqual(df_test.shape, (50, 5))\n def test_dataframe_columns_default(self):\n \"\"\"Test if the DataFrame has default column names ['a', 'b', 'c'] with default parameters.\"\"\"\n np.random.seed(4)\n df_test = f_872()\n self.assertListEqual(list(df_test.columns), [\"a\", \"b\", \"c\"])\n def test_dataframe_columns_custom(self):\n \"\"\"Test if the DataFrame has the correct column names when a custom number of columns is specified.\"\"\"\n np.random.seed(5)\n df_test = f_872(columns=5)\n expected_columns = [\"a\", \"b\", \"c\", \"d\", \"e\"]\n 
self.assertListEqual(list(df_test.columns), expected_columns)\n def test_dataframe_values(self):\n \"\"\"Test if each cell in the DataFrame contains a letter from the English alphabet.\"\"\"\n np.random.seed(6)\n df_test = f_872()\n for col in df_test.columns:\n self.assertTrue(\n set(df_test[col].unique()).issubset(set(\"abcdefghijklmnopqrstuvwxyz\"))\n )\n def test_dataframe_empty(self):\n \"\"\"Test if an empty DataFrame is created when 0 rows are specified.\"\"\"\n np.random.seed(7)\n df_test = f_872(0)\n self.assertEqual(df_test.shape, (0, 3))", "apis": ["pandas.DataFrame", "numpy.random", "numpy.random.choice"], "libs": ["numpy", "pandas"], "doc": {"description": ["Create a Pandas DataFrame with random alphabets in each cell.", "The DataFrame will have a specified number of rows and columns.", "Each column is named with a string from the list ['a', 'b', 'c', ...]", "depending on the number of columns specified."], "note": [], "params": ["rows (int, optional): Number of rows in the DataFrame. Defaults to 100.", "columns (int, optional): Number of columns in the DataFrame. Defaults to 3."], "returns": ["DataFrame: A pandas DataFrame with random alphabets."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> np.random.seed(0)", ">>> df = f_872(5, 3)", ">>> print(df)", "a b c", "0 m p v", "1 a d d", "2 h j t", "3 v s e", "4 x g y", ">>> df['a'].value_counts()", "a", "m 1", "a 1", "h 1", "v 1", "x 1", "Name: count, dtype: int64"]}} +{"task_id": "f_799", "prompt": "import re\nimport string\nimport random\n\n\ndef f_799(text: str, seed=None) -> str:\n \"\"\"\n Transforms a given string by removing special characters, normalizing whitespace,\n and randomizing character casing.\n\n Parameters:\n - text (str): The text string to be preprocessed.\n - seed (int, optional): Random seed for reproducibility. 
Defaults to None (not set).\n\n Returns:\n - str: The preprocessed text string.\n\n Requirements:\n - re\n - string\n - random\n\n Note:\n - This function considers special characters to be string punctuations.\n - Spaces, tabs, and newlines are replaced with with '_', '__', and '___' respectively.\n - To randomize casing, this function converts characters to uppercase with a 50% probability.\n\n Example:\n >>> f_799('Hello World!', 0)\n 'HeLlo___WORlD'\n >>> f_799('attention is all you need', 42)\n 'ATtENTIOn_IS_ALL_You_Need'\n \"\"\"", "canonical_solution": "\n if seed is not None:\n random.seed(seed)\n\n text = re.sub(\"[%s]\" % re.escape(string.punctuation), \"\", text)\n\n REPLACEMENTS = {\" \": \"_\", \"\\t\": \"__\", \"\\n\": \"___\"}\n for k, v in REPLACEMENTS.items():\n text = text.replace(k, v)\n\n text = \"\".join(random.choice([k.upper(), k]) for k in text)\n\n return text", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_799(\"Hello World!\", seed=1)\n self.assertNotIn(\" \", result, \"Spaces should be replaced.\")\n self.assertNotIn(\"!\", result, \"Special characters should be removed.\")\n self.assertEqual(\n len(result), len(\"Hello___World\"), \"Length should match processed input.\"\n )\n def test_case_2(self):\n result = f_799(\"Python!\", seed=2)\n self.assertNotIn(\"!\", result, \"Special characters should be removed.\")\n self.assertEqual(\n len(result), len(\"Python\"), \"Length should match processed input.\"\n )\n def test_case_3(self):\n result = f_799(\" \", seed=3)\n self.assertEqual(result, \"__\", \"Spaces should be replaced with underscores.\")\n def test_case_4(self):\n result = f_799(\"\\t\\n\", seed=4)\n self.assertEqual(\n result, \"_____\", \"Tab and newline should be replaced with underscores.\"\n )\n def test_case_5(self):\n result = f_799(\"a!b@c#\", seed=5)\n self.assertTrue(result.isalpha(), \"Output should only contain alphabets.\")\n self.assertEqual(\n len(result), 
len(\"abc\"), \"Length should match processed input.\"\n )\n def test_case_6(self):\n # Test with all types of whitespace characters\n result = f_799(\"a b\\tc\\nd\", seed=6)\n self.assertEqual(\n result.lower(),\n \"a_b__c___d\",\n \"Should replace all types of whitespaces correctly.\",\n )\n def test_case_7(self):\n # Test with a mix of alphanumeric and special characters\n result = f_799(\"a1! b2@ c3#\", seed=7)\n self.assertTrue(\n all(char.isalnum() or char == \"_\" for char in result),\n \"Should only contain alphanumeric characters and underscores.\",\n )\n def test_case_8(self):\n # Test with an empty string\n result = f_799(\"\", seed=8)\n self.assertEqual(result, \"\", \"Should handle empty string correctly.\")\n def test_case_9(self):\n # Test with a string that contains no special characters or whitespaces\n result = f_799(\"abcdefg\", seed=9)\n self.assertTrue(result.isalpha(), \"Should contain only letters.\")\n self.assertEqual(len(result), 7, \"Length should match the input.\")\n def test_case_10(self):\n # Test with a long string of repeated characters\n result = f_799(\"a\" * 50, seed=10)\n self.assertTrue(\n all(char.lower() == \"a\" for char in result),\n \"All characters should be 'a' or 'A'.\",\n )\n self.assertEqual(len(result), 50, \"Length should match the input.\")\n def test_case_11(self):\n # Test with only special characters\n result = f_799(\"!@#$%^&*\", seed=11)\n self.assertEqual(\n result, \"\", \"Should return an empty string for only special characters.\"\n )\n def test_case_12(self):\n # Test with numeric characters\n result = f_799(\"12345\", seed=13)\n self.assertTrue(result.isdigit(), \"Should contain only digits.\")\n self.assertEqual(len(result), 5, \"Length should match the input.\")\n def test_case_13(self):\n # Test with a string containing only whitespace characters\n result = f_799(\" \\t\\n\", seed=14)\n self.assertEqual(\n result,\n \"______\",\n \"Should replace all types of whitespaces correctly, with two 
underscores for tab and three for newline.\",\n )\n def test_case_14(self):\n # Test the randomness of uppercase conversion with a long string\n result = f_799(\"a\" * 100, seed=15)\n self.assertTrue(\n all(char.lower() == \"a\" for char in result),\n \"All characters should be 'a' or 'A'.\",\n )\n self.assertNotEqual(\n result, \"a\" * 100, \"Should have some uppercase transformations.\"\n )\n self.assertNotEqual(\n result, \"A\" * 100, \"Should have some lowercase transformations.\"\n )\n def test_case_15(self):\n # Test random seed impact\n result1 = f_799(\"test seed impact\", seed=42)\n result2 = f_799(\"test seed impact\", seed=42)\n self.assertEqual(\n result1, result2, \"Results with the same seed should be identical.\"\n )", "apis": ["re.sub", "random.seed", "random.choice", "re.escape", "string.punctuation"], "libs": ["re", "random", "string"], "doc": {"description": ["Transforms a given string by removing special characters, normalizing whitespace,", "and randomizing character casing."], "note": ["This function considers special characters to be string punctuations.", "Spaces, tabs, and newlines are replaced with with '_', '__', and '___' respectively.", "To randomize casing, this function converts characters to uppercase with a 50% probability."], "params": ["text (str): The text string to be preprocessed.", "seed (int, optional): Random seed for reproducibility. Defaults to None (not set)."], "returns": ["str: The preprocessed text string."], "reqs": ["re", "string", "random"], "raises": [], "example": [">>> f_799('Hello World!', 0)", "'HeLlo___WORlD'", ">>> f_799('attention is all you need', 42)", "'ATtENTIOn_IS_ALL_You_Need'"]}} +{"task_id": "f_883", "prompt": "import ssl\nimport os\nimport hashlib\n\n\ndef f_883(client_socket, cert_file, key_file, buffer_size=1024):\n \"\"\"\n This function secures a client socket using SSL/TLS and sends back the SHA256 hash of a file requested by the client. 
\n\n Parameters:\n - client_socket (socket.socket): The client socket that will be wrapped with SSL/TLS for secure communication.\n - cert_file (str): The file path to the SSL certificate to be used for the secure connection.\n - key_file (str): The file path to the SSL key corresponding to the certificate.\n - buffer_size (int, optional): The size of the buffer used to receive data from the client. Defaults to 1024 bytes.\n\n Returns:\n - str: The SHA256 hash of the requested file. If the requested file does not exist, returns 'File not found'. \n In case of an exception during processing, an error message is returned.\n\n Requirements:\n - ssl\n - os\n - hashlib\n\n Note:\n - This function assumes that the client requests a file by sending its path.\n - The function does not handle the opening or closing of the client_socket itself.\n - Error handling is basic and might need to be expanded based on specific use cases.\n \n Example:\n >>> # Server setup\n >>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n >>> server_socket.bind(('localhost', 443))\n >>> server_socket.listen(5)\n >>> cert_file = \"path/to/certificate.crt\"\n >>> key_file = \"path/to/private.key\"\n >>> # Accept client connection\n >>> client_socket, addr = server_socket.accept()\n >>> # Use f_883 function to handle the client request\n >>> file_hash = f_883(client_socket, cert_file, key_file)\n >>> print(\"Sent file hash:\", file_hash)\n >>> server_socket.close()\n \"\"\"", "canonical_solution": " context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)\n context.load_cert_chain(certfile=cert_file, keyfile=key_file)\n secure_socket = None\n try:\n secure_socket = context.wrap_socket(client_socket, server_side=True)\n request = secure_socket.recv(buffer_size).decode(\"utf-8\")\n\n if os.path.exists(request):\n with open(request, \"rb\") as file:\n sha256_hash = hashlib.sha256()\n for byte_block in iter(lambda: file.read(4096), b\"\"):\n sha256_hash.update(byte_block)\n response = 
sha256_hash.hexdigest()\n else:\n response = \"File not found\"\n\n secure_socket.send(response.encode(\"utf-8\"))\n except Exception as e:\n response = f\"Error: {str(e)}\"\n finally:\n if secure_socket:\n secure_socket.close()\n\n return response", "test": "import unittest\nfrom unittest.mock import MagicMock, patch\nimport ssl\nimport os\nimport hashlib\nclass TestCases(unittest.TestCase):\n \"\"\"Unit tests for f_883.\"\"\"\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_file_found(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function returns the correct SHA256 hash when the file exists.\"\"\"\n # Mocking the certificate and key file paths\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking the SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request and response\n mock_request = \"path/to/requested_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n # Mock file existence and content for hashing\n with patch(\"os.path.exists\") as mock_exists:\n mock_exists.return_value = True\n with patch(\n \"builtins.open\", unittest.mock.mock_open(read_data=b\"file content\")\n ) as mock_file:\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Check if file was opened\n mock_file.assert_called_with(mock_request, \"rb\")\n # Create expected hash\n expected_hash = hashlib.sha256(b\"file content\").hexdigest()\n # Assertions\n self.assertEqual(result, expected_hash)\n mock_context.wrap_socket.assert_called_with(\n mock_socket, server_side=True\n )\n mock_secure_socket.send.assert_called()\n mock_secure_socket.close.assert_called()\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_file_not_found(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function 
returns 'File not found' if the requested file does not exist.\"\"\"\n # Mocking the certificate and key file paths\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking the SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request\n mock_request = \"path/to/nonexistent_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n # Mock file existence\n with patch(\"os.path.exists\") as mock_exists:\n mock_exists.return_value = False\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Assertions\n self.assertEqual(result, \"File not found\")\n mock_context.wrap_socket.assert_called_with(mock_socket, server_side=True)\n mock_secure_socket.send.assert_called_with(\n \"File not found\".encode(\"utf-8\")\n )\n mock_secure_socket.close.assert_called()\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_exception_handling(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function handles exceptions properly.\"\"\"\n # Mocking the certificate and key file paths\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking the SSL context and setting up to raise an exception\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Configuring the secure_socket to raise an exception when recv is called\n mock_secure_socket.recv.side_effect = Exception(\"Test exception\")\n # Call the function and verify that it handles the exception\n result = f_883(mock_socket, cert_file, key_file)\n # Assertions\n self.assertTrue(\"Error: Test exception\" in result)\n mock_context.wrap_socket.assert_called_with(mock_socket, server_side=True)\n 
mock_secure_socket.close.assert_called()\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_f_883_empty_file(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function returns the correct SHA256 hash for an empty file.\"\"\"\n # Setup for empty file scenario\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request for an empty file\n mock_request = \"path/to/empty_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n with patch(\"os.path.exists\") as mock_exists, patch(\n \"builtins.open\", unittest.mock.mock_open(read_data=b\"\")\n ) as mock_file: # Note the b'' for empty bytes\n mock_exists.return_value = True\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Expected hash for an empty file\n expected_hash = hashlib.sha256(b\"\").hexdigest() # Hash of empty bytes\n # Assertions\n self.assertEqual(result, expected_hash)\n mock_file.assert_called_with(mock_request, \"rb\")\n @patch(\"ssl.SSLContext\")\n @patch(\"socket.socket\")\n def test_f_883_large_file(self, mock_socket, mock_ssl_context):\n \"\"\"Test that the function returns the correct SHA256 hash for a large file.\"\"\"\n # Setup for large file scenario\n cert_file = \"path/to/certificate.crt\"\n key_file = \"path/to/private.key\"\n # Mocking SSL context and secure socket\n mock_context = MagicMock()\n mock_ssl_context.return_value = mock_context\n mock_secure_socket = MagicMock()\n mock_context.wrap_socket.return_value = mock_secure_socket\n # Mocking the request for a large file\n mock_request = \"path/to/large_file.txt\"\n mock_secure_socket.recv.return_value = mock_request.encode(\"utf-8\")\n large_file_content = b\"a\" * 10**6 # 1 MB of data\n with 
patch(\"os.path.exists\") as mock_exists, patch(\n \"builtins.open\", unittest.mock.mock_open(read_data=large_file_content)\n ) as mock_file:\n mock_exists.return_value = True\n # Call the function\n result = f_883(mock_socket, cert_file, key_file)\n # Expected hash for the large file\n expected_hash = hashlib.sha256(large_file_content).hexdigest()\n # Assertions\n self.assertEqual(result, expected_hash)\n mock_file.assert_called_with(mock_request, \"rb\")", "apis": ["ssl.PROTOCOL_TLS_SERVER", "hashlib.sha256", "ssl.SSLContext", "os.path", "os.path.exists"], "libs": ["os", "ssl", "hashlib"], "doc": {"description": ["This function secures a client socket using SSL/TLS and sends back the SHA256 hash of a file requested by the client."], "note": ["This function assumes that the client requests a file by sending its path.", "The function does not handle the opening or closing of the client_socket itself.", "Error handling is basic and might need to be expanded based on specific use cases."], "params": ["client_socket (socket.socket): The client socket that will be wrapped with SSL/TLS for secure communication.", "cert_file (str): The file path to the SSL certificate to be used for the secure connection.", "key_file (str): The file path to the SSL key corresponding to the certificate.", "buffer_size (int, optional): The size of the buffer used to receive data from the client. Defaults to 1024 bytes."], "returns": ["str: The SHA256 hash of the requested file. 
If the requested file does not exist, returns 'File not found'.", "In case of an exception during processing, an error message is returned."], "reqs": ["ssl", "os", "hashlib"], "raises": [], "example": [">>> # Server setup", ">>> server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)", ">>> server_socket.bind(('localhost', 443))", ">>> server_socket.listen(5)", ">>> cert_file = \"path/to/certificate.crt\"", ">>> key_file = \"path/to/private.key\"", ">>> # Accept client connection", ">>> client_socket, addr = server_socket.accept()", ">>> # Use f_883 function to handle the client request", ">>> file_hash = f_883(client_socket, cert_file, key_file)", ">>> print(\"Sent file hash:\", file_hash)", ">>> server_socket.close()"]}} +{"task_id": "f_858", "prompt": "import requests\nfrom lxml import html\nimport pandas as pd\nimport sqlite3\n\n\ndef f_858(webpage_url: str, database_name: str = \"my_database.db\") -> int:\n \"\"\"\n This function parses HTML table data from a specified URL or local file and stores it into an SQLite database.\n The function handles different scenarios for fetching, processing, and storing data.\n\n Parameters:\n - webpage_url (str): The URL of the webpage or a local file path prefixed with \"file://\".\n - database_name (str): The name of the SQLite database file where the data is to be stored. Defaults to \"my_database.db\".\n\n Returns:\n - int: The number of rows in the parsed HTML table.\n\n Raises:\n - requests.RequestException: This exception is raised if there is a network issue in accessing the URL. \n This includes scenarios like connection errors, timeouts, and HTTP errors.\n - sqlite3.DatabaseError: This exception is raised in case of issues connecting to, or writing to, the SQLite database. 
\n This includes issues like invalid database names, write permissions, or SQL execution errors.\n\n Notes:\n - The function is designed to replace the table \"my_table\" in the specified SQLite database with new data each time it is called.\n - If the HTML content does not contain a table or if the table is empty, the function will return 0, indicating no rows were parsed and stored.\n - This function relies on the 'requests', 'lxml', 'pandas', and 'sqlite3' libraries for its operations.\n\n Requirements:\n - requests\n - lxml\n - pandas\n - sqlite3\n \n Example:\n >>> num_rows = f_858(\"http://example.com/tabledata\")\n >>> print(f\"Number of rows parsed: {num_rows}\")\n Number of rows parsed: 5\n \"\"\"", "canonical_solution": " try:\n if webpage_url.startswith(\"file://\"):\n with open(webpage_url[7:], \"r\", encoding=\"utf-8\") as file:\n content = file.read()\n else:\n response = requests.get(webpage_url, timeout=5)\n response.raise_for_status()\n content = response.content\n\n tree = html.fromstring(content)\n rows = tree.xpath(\"//tr\")\n data = [\n [cell.text_content().strip() for cell in row.xpath(\".//td\")] for row in rows\n ]\n\n # Create DataFrame\n df = pd.DataFrame(data)\n if df.empty:\n return 0\n\n # Store data in database\n conn = None\n try:\n conn = sqlite3.connect(database_name)\n df.to_sql(\"my_table\", conn, if_exists=\"replace\", index=False)\n finally:\n if conn:\n conn.close()\n\n return len(df)\n\n except requests.RequestException as e:\n raise requests.RequestException(f\"Error accessing URL {webpage_url}: {e}\")\n except sqlite3.DatabaseError as e:\n raise sqlite3.DatabaseError(f\"Database error with {database_name}: {e}\")", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport requests\nimport sqlite3\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_858.\"\"\"\n @patch(\"requests.get\")\n def test_valid_webpage_url(self, mock_get):\n \"\"\"\n Test processing HTML table data from a valid 
webpage URL.\n \"\"\"\n mock_response = MagicMock()\n mock_response.content = (\n b\"
1
\"\n )\n mock_response.status_code = 200\n mock_get.return_value = mock_response\n result = f_858(\"http://example.com\")\n self.assertEqual(result, 1)\n @patch(\n \"builtins.open\",\n new_callable=unittest.mock.mock_open,\n read_data=\"
1
\",\n )\n def test_local_file_url(self, mock_file):\n \"\"\"\n Test processing HTML table data from a local file.\n \"\"\"\n result = f_858(\"file:///path/to/file.html\")\n self.assertEqual(result, 1)\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"\n Test function behavior with an invalid URL.\n \"\"\"\n mock_get.side_effect = requests.RequestException(\"mocked request exception\")\n with self.assertRaises(requests.RequestException):\n f_858(\"http://invalid-url.com\")\n @patch(\"requests.get\")\n def test_empty_table(self, mock_get):\n \"\"\"\n Test handling an HTML page with an empty table.\n \"\"\"\n mock_response = MagicMock()\n mock_response.content = b\"
\"\n mock_response.status_code = 200\n mock_get.return_value = mock_response\n result = f_858(\"http://example.com/empty\")\n self.assertEqual(result, 0)\n @patch(\"requests.get\")\n @patch(\"sqlite3.connect\")\n def test_database_error(self, mock_connect, mock_get):\n \"\"\"\n Test function behavior when encountering a database error.\n \"\"\"\n # Mock the response from requests.get\n mock_response = MagicMock()\n mock_response.content = (\n b\"
Data
\"\n )\n mock_response.status_code = 200\n mock_get.return_value = mock_response\n # Simulate a database error\n mock_connect.side_effect = sqlite3.DatabaseError(\"mocked database error\")\n # Expect a DatabaseError to be raised\n with self.assertRaises(sqlite3.DatabaseError):\n f_858(\"http://example.com\", \"faulty_database.db\")\n @classmethod\n def tearDownClass(cls):\n \"\"\"Remove the database file with retries.\"\"\"\n if os.path.exists(\"my_database.db\"):\n os.remove(\"my_database.db\")", "apis": ["requests.get", "pandas.DataFrame", "sqlite3.connect", "sqlite3.DatabaseError", "requests.RequestException", "lxml.html.fromstring"], "libs": ["lxml", "pandas", "sqlite3", "requests"], "doc": {"description": ["This function parses HTML table data from a specified URL or local file and stores it into an SQLite database.", "The function handles different scenarios for fetching, processing, and storing data.", "Notes:", "- The function is designed to replace the table \"my_table\" in the specified SQLite database with new data each time it is called.", "- If the HTML content does not contain a table or if the table is empty, the function will return 0, indicating no rows were parsed and stored.", "- This function relies on the 'requests', 'lxml', 'pandas', and 'sqlite3' libraries for its operations."], "note": [], "params": ["webpage_url (str): The URL of the webpage or a local file path prefixed with \"file://\".", "database_name (str): The name of the SQLite database file where the data is to be stored. 
Defaults to \"my_database.db\"."], "returns": ["int: The number of rows in the parsed HTML table."], "reqs": ["requests", "lxml", "pandas", "sqlite3"], "raises": ["requests.RequestException: This exception is raised if there is a network issue in accessing the URL.", "This includes scenarios like connection errors, timeouts, and HTTP errors.", "sqlite3.DatabaseError: This exception is raised in case of issues connecting to, or writing to, the SQLite database.", "This includes issues like invalid database names, write permissions, or SQL execution errors."], "example": [">>> num_rows = f_858(\"http://example.com/tabledata\")", ">>> print(f\"Number of rows parsed: {num_rows}\")", "Number of rows parsed: 5"]}} +{"task_id": "f_356", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndef f_356(n_components=2, N_SAMPLES=500, N_FEATURES=50, random_seed=None):\n \"\"\"\n Generate a high-dimensional dataset, run PCA to reduce its dimensionality, and then draw a heatmap of\n the covariance matrix of the transformed data.\n\n Parameters:\n n_components (int, optional): The number of components for PCA. Defaults to 2.\n N_SAMPLES (int, optional): Number of samples in the dataset. Defaults to 500.\n N_FEATURES (int, optional): Number of features in the dataset. Defaults to 50.\n random_seed (int, optional): Seed for the numpy and sklearn random number generator. 
Defaults to None.\n\n Returns:\n tuple:\n transformed_data (ndarray): The transformed data of shape (N_SAMPLES, n_components).\n heatmap_axes (Axes): The heatmap of the covariance matrix of the transformed data or None if n_components=1.\n\n Requirements:\n - numpy\n - sklearn.decomposition.PCA\n - matplotlib.pyplot\n - seaborn\n\n Example:\n >>> transformed, ax = f_356(n_components=2, random_seed=42)\n >>> transformed.shape\n (500, 2)\n \"\"\"", "canonical_solution": " np.random.seed(random_seed) # Ensuring reproducibility\n X = np.random.rand(N_SAMPLES, N_FEATURES)\n\n pca = PCA(n_components=n_components, random_state=random_seed)\n X_transformed = pca.fit_transform(X)\n\n if n_components == 1:\n return X_transformed, None\n\n fig, ax = plt.subplots(figsize=(10, 7))\n sns.heatmap(np.cov(X_transformed.T), annot=True, fmt=\".2f\", ax=ax)\n\n return X_transformed, ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.decomposition import PCA\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.seed = 42\n # default parameters\n self.n_components = 2\n self.N_SAMPLES = 500\n self.N_FEATURES = 50\n def test_case_1(self):\n # Test basic functionality - results\n transformed_data, _ = f_356()\n self.assertEqual(transformed_data.shape, (self.N_SAMPLES, self.n_components))\n np.random.seed(self.seed)\n X = np.random.rand(self.N_SAMPLES, self.N_FEATURES)\n pca = PCA(n_components=self.n_components, random_state=self.seed)\n pca.fit(X)\n self.assertTrue(np.sum(pca.explained_variance_ratio_) <= 1)\n def test_case_2(self):\n # Test basic functionality - visualization\n _, heatmap_axes = f_356()\n self.assertIsNotNone(heatmap_axes)\n self.assertIsInstance(heatmap_axes, plt.Axes)\n self.assertEqual(len(heatmap_axes.get_xticklabels()), 2)\n self.assertEqual(len(heatmap_axes.get_yticklabels()), 2)\n def test_case_3(self):\n # Test n_components\n for n_components in [1, 10, self.N_FEATURES]:\n transformed_data, _ = f_356(\n 
n_components=n_components, N_FEATURES=self.N_FEATURES\n )\n self.assertEqual(transformed_data.shape, (self.N_SAMPLES, n_components))\n def test_case_4(self):\n # Test N_SAMPLES\n for n_samples in [self.n_components, 10, 50, 100]:\n transformed_data, _ = f_356(N_SAMPLES=n_samples)\n self.assertEqual(transformed_data.shape, (n_samples, self.n_components))\n def test_case_5(self):\n # Test N_FEATURES\n for n_features in [self.n_components, 10, 50, 100]:\n transformed_data, _ = f_356(N_FEATURES=n_features)\n self.assertEqual(\n transformed_data.shape, (self.N_SAMPLES, self.n_components)\n )\n def test_case_6(self):\n # Test random_seed\n transformed_data1, _ = f_356(random_seed=self.seed)\n transformed_data2, _ = f_356(random_seed=self.seed)\n np.testing.assert_array_equal(transformed_data1, transformed_data2)\n transformed_data2, _ = f_356(random_seed=0)\n with self.assertRaises(AssertionError):\n np.testing.assert_array_equal(transformed_data1, transformed_data2)\n def test_case_7(self):\n # Function should fail at invalid values\n with self.assertRaises(ValueError):\n # negative n_components\n f_356(n_components=-1)\n with self.assertRaises(ValueError):\n # more components than features\n f_356(n_components=self.N_FEATURES + 10, N_FEATURES=self.N_FEATURES)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["seaborn.heatmap", "numpy.random", "numpy.cov", "numpy.random.rand", "matplotlib.pyplot.subplots", "numpy.random.seed", "sklearn.decomposition.PCA"], "libs": ["sklearn", "seaborn", "numpy", "matplotlib"], "doc": {"description": ["Generate a high-dimensional dataset, run PCA to reduce its dimensionality, and then draw a heatmap of", "the covariance matrix of the transformed data."], "note": [], "params": ["n_components (int, optional): The number of components for PCA. Defaults to 2.", "N_SAMPLES (int, optional): Number of samples in the dataset. Defaults to 500.", "N_FEATURES (int, optional): Number of features in the dataset. 
Defaults to 50.", "random_seed (int, optional): Seed for the numpy and sklearn random number generator. Defaults to None."], "returns": ["tuple:", "transformed_data (ndarray): The transformed data of shape (N_SAMPLES, n_components).", "heatmap_axes (Axes): The heatmap of the covariance matrix of the transformed data or None if n_components=1."], "reqs": ["numpy", "sklearn.decomposition.PCA", "matplotlib.pyplot", "seaborn"], "raises": [], "example": [">>> transformed, ax = f_356(n_components=2, random_seed=42)", ">>> transformed.shape", "(500, 2)"]}} +{"task_id": "f_893", "prompt": "import re\nimport pandas as pd\n\n\ndef f_893(input_string: str) -> pd.DataFrame:\n \"\"\"\n Process a multi-line string by replacing tabs with spaces and converting it into a pandas DataFrame.\n Each non-empty line of the input string is transformed into a separate row in the DataFrame.\n The function specifically filters out empty lines and replaces tabs with single spaces in the remaining lines.\n\n Parameters:\n - input_string (str): A multi-line string. Each line is separated by a newline character ('\\\\n').\n\n Returns:\n - pd.DataFrame: A DataFrame with a single column named 'Text'. Each row in this column corresponds to a non-empty\n line from the input string, with tabs replaced by spaces.\n\n Requirements:\n - re\n - pandas\n\n Note:\n - The function excludes lines that are empty or contain only whitespace.\n - Tabs within the lines are replaced with a single space. 
For instance, a '\\\\t' character in the input string\n will be replaced by ' ' in the output DataFrame.\n\n Example:\n >>> df = f_893('line a\\\\nfollowed by line b with a\\\\ttab\\\\n\\\\n...bye\\\\n')\n >>> print(df.head())\n Text\n 0 line a\n 1 followed by line b with a tab\n 2 ...bye\n \"\"\"", "canonical_solution": " input_string = input_string.replace('\\\\n', '\\n').replace('\\\\t', ' ')\n # Split the input string into lines and filter out empty lines\n lines = [line for line in input_string.split(\"\\n\") if line.strip()]\n # Replace tabs with spaces in each line\n lines = [re.sub(\"\\t\", \" \", line) for line in lines]\n # Create a DataFrame from the processed lines\n return pd.DataFrame(lines, columns=[\"Text\"])", "test": "import pandas as pd\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_893.\"\"\"\n def test_basic_string(self):\n \"\"\"\n Test with a basic multi-line string.\n \"\"\"\n input_str = \"line1\\nline2 with a\\ttab\\nline3\"\n expected_output = pd.DataFrame({\"Text\": [\"line1\", \"line2 with a tab\", \"line3\"]})\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_empty_string(self):\n \"\"\"\n Test with an empty string.\n \"\"\"\n input_str = \"\"\n expected_output = pd.DataFrame(columns=[\"Text\"])\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_string_with_empty_lines(self):\n \"\"\"\n Test with a string that contains empty lines.\n \"\"\"\n input_str = \"line1\\n\\nline3\"\n expected_output = pd.DataFrame({\"Text\": [\"line1\", \"line3\"]})\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_string_with_only_tabs(self):\n \"\"\"\n Test with a string that contains only tabs.\n \"\"\"\n input_str = \"\\t\\t\\t\"\n expected_output = pd.DataFrame(columns=[\"Text\"])\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)\n def test_string_with_mixed_whitespace(self):\n \"\"\"\n Test with a string that contains a mix of 
tabs and spaces.\n \"\"\"\n input_str = \"line1\\n \\t \\nline3\"\n expected_output = pd.DataFrame({\"Text\": [\"line1\", \"line3\"]})\n pd.testing.assert_frame_equal(f_893(input_str), expected_output)", "apis": ["re.sub", "pandas.DataFrame"], "libs": ["re", "pandas"], "doc": {"description": ["Process a multi-line string by replacing tabs with spaces and converting it into a pandas DataFrame.", "Each non-empty line of the input string is transformed into a separate row in the DataFrame.", "The function specifically filters out empty lines and replaces tabs with single spaces in the remaining lines."], "note": ["The function excludes lines that are empty or contain only whitespace.", "Tabs within the lines are replaced with a single space. For instance, a '\\\\t' character in the input string", "will be replaced by ' ' in the output DataFrame."], "params": ["input_string (str): A multi-line string. Each line is separated by a newline character ('\\\\n')."], "returns": ["pd.DataFrame: A DataFrame with a single column named 'Text'. Each row in this column corresponds to a non-empty", "line from the input string, with tabs replaced by spaces."], "reqs": ["re", "pandas"], "raises": [], "example": [">>> df = f_893('line a\\\\nfollowed by line b with a\\\\ttab\\\\n\\\\n...bye\\\\n')", ">>> print(df.head())", "Text", "0 line a", "1 followed by line b with a tab", "2 ...bye"]}} +{"task_id": "f_816", "prompt": "import os\nimport shutil\n\n\ndef f_816(path, delimiter=\"/\"):\n \"\"\"\n Splits a given file path by a specific delimiter and computes disk usage for each directory component.\n\n Parameters:\n - path (str): The file path to split.\n - delimiter (str, optional): The delimiter to use for splitting the path. 
Default is '/'.\n\n Returns:\n list: A list of tuples where each tuple contains a path component and its disk usage as a dictionary.\n The disk usage dictionary contains keys 'total', 'used', and 'free'.\n\n Raises:\n - ValueError: If the 'path' is empty, not a string, or contain invalid components.\n - FileNotFoundError: If the 'path' does not exist in the filesystem.\n\n Requirements:\n - os\n - shutil\n\n Examples:\n >>> f_816('Docs/src', '/')\n [('Docs', {'total': 100, 'used': 50, 'free': 50}), ('src', {'total': 200, 'used': 100, 'free': 100})]\n\n >>> f_816('a/b', '/')\n [('a', {'total': 300, 'used': 150, 'free': 150}), ('b', {'total': 400, 'used': 200, 'free': 200})]\n \"\"\"", "canonical_solution": " if not path or not isinstance(path, str):\n raise ValueError(\"Path must be a non-empty string\")\n if not os.path.exists(path):\n raise FileNotFoundError(f\"Path '{path}' does not exist\")\n\n path_components = path.strip(delimiter).split(delimiter)\n if not all(path_components):\n raise ValueError(\"Path contains invalid components\")\n\n results = []\n for index, component in enumerate(path_components):\n sub_path = delimiter.join(path_components[: index + 1])\n if not sub_path.startswith(delimiter):\n sub_path = delimiter + sub_path\n usage = shutil.disk_usage(sub_path)\n results.append(\n (component, {\"total\": usage.total, \"used\": usage.used, \"free\": usage.free})\n )\n\n return results", "test": "import unittest\nfrom collections import namedtuple\nfrom unittest.mock import patch\nimport tempfile\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n DiskUsage = namedtuple(\"DiskUsage\", [\"total\", \"used\", \"free\"])\n # Setup realistic disk usage values for different directories\n self.mock_usage_root = DiskUsage(500000000000, 300000000000, 200000000000)\n self.mock_usage_docs = DiskUsage(100000000000, 50000000000, 50000000000)\n self.mock_usage_src = DiskUsage(50000000000, 25000000000, 25000000000)\n self.mock_usage_home = 
DiskUsage(200000000000, 100000000000, 100000000000)\n def disk_usage_side_effect(self, path):\n # Helper for mocking\n if path.endswith(\"src\"):\n return self.mock_usage_src\n elif path.endswith(\"Docs\"):\n return self.mock_usage_docs\n elif path == \"/home\":\n return self.mock_usage_home\n return self.mock_usage_root\n @patch(\"os.path.exists\")\n def test_nonexist_path(self, mock_exists):\n # Test function should raise error if path does not exist\n mock_exists.return_value = True\n with tempfile.TemporaryDirectory() as tmpdirname:\n non_exist_path = os.path.join(tmpdirname, \"nonexist\")\n with self.assertRaises(FileNotFoundError):\n f_816(non_exist_path)\n def test_invalid_path(self):\n # Test function should raise error if path is not valid\n with self.assertRaises(ValueError):\n f_816(\"\")\n with self.assertRaises(ValueError):\n f_816(123)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_varied_path(self, mock_disk_usage, mock_exists):\n # Test functionality\n mock_exists.return_value = True\n mock_disk_usage.side_effect = self.disk_usage_side_effect\n result = f_816(\"Docs/src\")\n expected = [\n (\n \"Docs\",\n {\n \"total\": self.mock_usage_docs.total,\n \"used\": self.mock_usage_docs.used,\n \"free\": self.mock_usage_docs.free,\n },\n ),\n (\n \"src\",\n {\n \"total\": self.mock_usage_src.total,\n \"used\": self.mock_usage_src.used,\n \"free\": self.mock_usage_src.free,\n },\n ),\n ]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_deep_nested_path(self, mock_disk_usage, mock_exists):\n # Test nested paths\n mock_exists.return_value = True\n mock_disk_usage.return_value = self.mock_usage_src\n deep_path = \"Docs/src/Projects/Python/Example\"\n result = f_816(deep_path)\n expected = [\n (\"Docs\", self.mock_usage_src._asdict()),\n (\"src\", self.mock_usage_src._asdict()),\n (\"Projects\", self.mock_usage_src._asdict()),\n (\"Python\", self.mock_usage_src._asdict()),\n 
(\"Example\", self.mock_usage_src._asdict()),\n ]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_single_directory(self, mock_disk_usage, mock_exists):\n # Test function works on single directory\n mock_exists.return_value = True\n mock_disk_usage.return_value = self.mock_usage_home\n result = f_816(\"home\")\n expected = [(\"home\", self.mock_usage_home._asdict())]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_path_with_multiple_delimiters(self, mock_disk_usage, mock_exists):\n # Test should fail if there is an invalid path component\n mock_exists.return_value = True\n mock_disk_usage.side_effect = lambda path: {\n \"/Docs\": self.mock_usage_docs,\n \"/Docs/src\": self.mock_usage_src,\n }.get(path, self.mock_usage_root)\n with self.assertRaises(ValueError):\n result = f_816(\"Docs//src\")\n expected = [\n (\"Docs\", self.mock_usage_docs._asdict()),\n (\"\", {\"total\": 0, \"used\": 0, \"free\": 0}),\n (\"src\", self.mock_usage_src._asdict()),\n ]\n self.assertEqual(result, expected)\n @patch(\"os.path.exists\")\n @patch(\"shutil.disk_usage\")\n def test_path_with_trailing_delimiter(self, mock_disk_usage, mock_exists):\n # Test should handle trailing delimiter\n mock_exists.return_value = True\n mock_disk_usage.side_effect = lambda path: {\n \"/Docs\": self.mock_usage_docs,\n \"/Docs/src\": self.mock_usage_src,\n }.get(path, self.mock_usage_root)\n result = f_816(\"Docs/src/\")\n expected = [\n (\"Docs\", self.mock_usage_docs._asdict()),\n (\"src\", self.mock_usage_src._asdict()),\n ]\n self.assertEqual(result, expected)", "apis": ["os.path", "shutil.disk_usage", "os.path.exists"], "libs": ["os", "shutil"], "doc": {"description": ["Splits a given file path by a specific delimiter and computes disk usage for each directory component.", ">>> f_816('a/b', '/')", "[('a', {'total': 300, 'used': 150, 'free': 150}), ('b', {'total': 400, 'used': 200, 
'free': 200})]"], "note": [], "params": ["path (str): The file path to split.", "delimiter (str, optional): The delimiter to use for splitting the path. Default is '/'."], "returns": ["list: A list of tuples where each tuple contains a path component and its disk usage as a dictionary.", "The disk usage dictionary contains keys 'total', 'used', and 'free'."], "reqs": ["os", "shutil"], "raises": ["ValueError: If the 'path' is empty, not a string, or contain invalid components.", "FileNotFoundError: If the 'path' does not exist in the filesystem."], "example": ["Examples:", ">>> f_816('Docs/src', '/')", "[('Docs', {'total': 100, 'used': 50, 'free': 50}), ('src', {'total': 200, 'used': 100, 'free': 100})]"]}} +{"task_id": "f_347", "prompt": "import numpy as np\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\n\n\ndef f_347(P, T, tensor_shape=(3, 3, 3)):\n \"\"\"\n Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then apply PCA to reduce the\n dimensionality of the result. The resulting 2D data is then visualized.\n Note: This function only accepts numpy matrices/arrays.\n\n Parameters:\n P (numpy.ndarray): The input matrix.\n T (numpy.ndarray): The input tensor. Must have same shape as tensor_shape.\n tensor_shape (tuple, optional): The shape of the tensor. Must be same as T.shape. 
Default is (3, 3, 3).\n\n Returns:\n pca_result (numpy.ndarray): The result of PCA of shape (N, 2), where N is the number of rows in matrix P.\n ax (matplotlib.axes.Axes): Plot of 'PCA Result Visualization', with 'Principal Component 1' on the x-axis\n and 'Principal Component 2' on the y-axis.\n\n\n\n Requirements:\n - numpy\n - sklearn.decomposition\n - matplotlib.pyplot\n\n Example:\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])\n >>> T = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])\n >>> pca_result, ax = f_347(P, T)\n >>> pca_result.shape\n (3, 2)\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " if not (isinstance(P, np.ndarray) and isinstance(T, np.ndarray)):\n raise TypeError(\"Expected inputs to be numpy arrays\")\n\n if not T.shape == tensor_shape:\n raise ValueError(\"Provided tensor does not match the specified tensor_shape.\")\n\n result = np.tensordot(P, T, axes=[1, 1]).swapaxes(0, 1)\n\n # Reshape the result for PCA\n result = result.reshape(result.shape[0], -1)\n pca = PCA(n_components=2)\n pca_result = pca.fit_transform(result)\n\n fig, ax = plt.subplots()\n ax.scatter(pca_result[:, 0], pca_result[:, 1])\n ax.set_title(\"PCA Result Visualization\")\n ax.set_xlabel(\"Principal Component 1\")\n ax.set_ylabel(\"Principal Component 2\")\n\n return pca_result, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(0)\n # Set up common matrices and tensors for testing\n self.TENSOR_SHAPE = (3, 3, 3)\n self.P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1]])\n self.T = np.random.rand(*self.TENSOR_SHAPE)\n self.T_zeros = np.zeros(self.TENSOR_SHAPE)\n self.T_ones = np.ones(self.TENSOR_SHAPE)\n def test_case_1(self):\n # Test results and plot correctness\n pca_result, ax = f_347(self.P, self.T)\n self._common_assertions(pca_result, ax)\n def test_case_2(self):\n # Function should fail when input 
types are invalid\n with self.assertRaises(Exception):\n f_347(\"not a numpy array\", self.T, self.TENSOR_SHAPE)\n with self.assertRaises(Exception):\n f_347(self.P, \"not a numpy array\", self.TENSOR_SHAPE)\n with self.assertRaises(Exception):\n f_347([], [], self.TENSOR_SHAPE)\n def test_case_3(self):\n # Function should fail when input shapes are invalid\n T_incorrect_shape = np.random.rand(2, 2, 2)\n with self.assertRaises(Exception):\n f_347(self.P, T_incorrect_shape, self.TENSOR_SHAPE)\n with self.assertRaises(Exception):\n f_347(np.array([]), np.array([]), self.TENSOR_SHAPE)\n def test_case_4(self):\n # Test custom shapes\n P = np.random.rand(5, 4)\n T = np.random.rand(5, 4, 4)\n pca_result, ax = f_347(P, T, tensor_shape=T.shape)\n self._common_assertions(pca_result, ax)\n def test_case_5(self):\n # Test with zeros\n pca_result, ax = f_347(self.P, self.T_zeros)\n self._common_assertions(pca_result, ax)\n def test_case_6(self):\n # Adjusting the matrix and tensor to have a slight variation\n P = np.array([[1.01, 0.01, 0.01], [0.01, 1.01, 0.01], [0.01, 0.01, 1.01]])\n T = np.ones(self.TENSOR_SHAPE) + 0.01 * np.random.rand(*self.TENSOR_SHAPE)\n pca_result, ax = f_347(P, T)\n # Assert that the PCA results don't produce NaN values and that there's a reduction in dimensionality\n self.assertFalse(np.isnan(pca_result).any())\n self.assertEqual(pca_result.shape[1], 2)\n # Also check common assertions\n self._common_assertions(pca_result, ax)\n def _common_assertions(self, pca_result, ax):\n # Common assertions for shape and plot labels\n self.assertEqual(pca_result.shape[1], 2)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"PCA Result Visualization\")\n self.assertEqual(ax.get_xlabel(), \"Principal Component 1\")\n self.assertEqual(ax.get_ylabel(), \"Principal Component 2\")\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "sklearn.decomposition.PCA", "numpy.ndarray", "numpy.tensordot"], "libs": 
["sklearn", "numpy", "matplotlib"], "doc": {"description": ["Calculate the product of a matrix \"P\" and a 3D tensor \"T\" with numpy and then apply PCA to reduce the", "dimensionality of the result. The resulting 2D data is then visualized."], "note": ["This function only accepts numpy matrices/arrays."], "params": ["P (numpy.ndarray): The input matrix.", "T (numpy.ndarray): The input tensor. Must have same shape as tensor_shape.", "tensor_shape (tuple, optional): The shape of the tensor. Must be same as T.shape. Default is (3, 3, 3)."], "returns": ["pca_result (numpy.ndarray): The result of PCA of shape (N, 2), where N is the number of rows in matrix P.", "ax (matplotlib.axes.Axes): Plot of 'PCA Result Visualization', with 'Principal Component 1' on the x-axis", "and 'Principal Component 2' on the y-axis."], "reqs": ["numpy", "sklearn.decomposition", "matplotlib.pyplot"], "raises": [], "example": [">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])", ">>> T = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])", ">>> pca_result, ax = f_347(P, T)", ">>> pca_result.shape", "(3, 2)", ">>> type(ax)", ""]}} +{"task_id": "f_398", "prompt": "import pandas as pd\nimport numpy as np\n\n\ndef f_398(column, data):\n \"\"\"\n Analyzes a list of stock data and calculates the sum, mean, minimum, and maximum\n values for a specified column.\n\n Parameters:\n - column (str): The name of the column to analyze. 
Valid options are 'Date', 'Open', 'High',\n 'Low', 'Close', and 'Volume'.\n - data (list of lists): A list where each element is a list representing stock data for a single day.\n Each inner list should contain values in the following order:\n 'Date', 'Open', 'High', 'Low', 'Close', 'Volume'.\n Function will raise ValueError if the structure is not as expected.\n Returns:\n - dict: A dictionary containing the calculated 'sum', 'mean', 'min' (minimum), and 'max' (maximum)\n for the specified column. If the input data is empty, 'sum' will be 0, and 'mean', 'min', and\n 'max' will be NaN.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]\n >>> results = f_398('Open', data)\n >>> results\n {'sum': 100, 'mean': 100.0, 'min': 100, 'max': 100}\n >>> type(results)\n \n \"\"\"", "canonical_solution": " valid_columns = [\"Date\", \"Open\", \"High\", \"Low\", \"Close\", \"Volume\"]\n if column not in valid_columns:\n raise ValueError(f\"Invalid column name.\")\n if not isinstance(data, list) or (\n len(data) > 0\n and not all(\n isinstance(row, list) and len(row) == len(valid_columns) for row in data\n )\n ):\n raise ValueError(\n \"Data must be a list of lists, with each inner list matching the length of the column names.\"\n )\n\n df = pd.DataFrame(data, columns=valid_columns)\n column_data = df[column]\n\n result = {\n \"sum\": np.sum(column_data) if not column_data.empty else 0,\n \"mean\": np.mean(column_data) if not column_data.empty else float(\"nan\"),\n \"min\": np.min(column_data) if not column_data.empty else float(\"nan\"),\n \"max\": np.max(column_data) if not column_data.empty else float(\"nan\"),\n }\n\n return result", "test": "import unittest\nimport numpy as np\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n def assertDictAlmostEqual(self, d1, d2, msg=None):\n # Helper function for testing\n for k, v in d1.items():\n if isinstance(v, float) and np.isnan(v):\n 
self.assertTrue(np.isnan(d2[k]), msg or f\"{k} not almost equal\")\n else:\n self.assertAlmostEqual(v, d2[k], msg=msg or f\"{k} not equal\")\n def test_case_1(self):\n # Test with valid data for a specific column\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), 102, 108, 100, 105, 15000],\n [datetime(2022, 1, 3), 105, 110, 103, 108, 20000],\n ]\n result = f_398(\"Open\", data)\n expected_result = {\n \"sum\": 307,\n \"mean\": 102.33333333333333,\n \"min\": 100,\n \"max\": 105,\n }\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_2(self):\n # Test with empty data list\n data = []\n result = f_398(\"Open\", data)\n expected_result = {\n \"sum\": 0,\n \"mean\": float(\"nan\"),\n \"min\": float(\"nan\"),\n \"max\": float(\"nan\"),\n }\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_3(self):\n # Test with an invalid column name\n data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]\n with self.assertRaises(ValueError):\n f_398(\"InvalidColumn\", data)\n def test_case_4(self):\n # Test with NaN values in the target column\n data = [\n [datetime(2022, 1, 1), np.nan, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), 102, np.nan, 100, 105, 15000],\n [datetime(2022, 1, 3), 105, np.nan, 103, 108, 20000],\n ]\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 207, \"mean\": 103.5, \"min\": 102, \"max\": 105}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_5(self):\n # Test with all values in the target column being the same\n data = [[datetime(2022, 1, 1), 100, 100, 100, 100, 10000]] * 3\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 300, \"mean\": 100, \"min\": 100, \"max\": 100}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_6(self):\n # Test for handling mixed data types within a single column\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), \"102\", 108, 100, 105, 15000],\n ]\n 
with self.assertRaises(TypeError):\n f_398(\"Open\", data)\n def test_case_7(self):\n # Test with extremely large values in the target column\n data = [[datetime(2022, 1, 1), 1e18, 1.05e18, 0.95e18, 1.02e18, 10000]]\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 1e18, \"mean\": 1e18, \"min\": 1e18, \"max\": 1e18}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_8(self):\n # Test with a single row of data\n data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]\n result = f_398(\"Open\", data)\n expected_result = {\"sum\": 100, \"mean\": 100, \"min\": 100, \"max\": 100}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_9(self):\n # Test with a very large dataset to check performance/scalability\n large_data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]] * 10000\n result = f_398(\"Open\", large_data)\n expected_result = {\"sum\": 1000000, \"mean\": 100, \"min\": 100, \"max\": 100}\n self.assertDictAlmostEqual(result, expected_result)\n def test_case_10(self):\n # Test for column case sensitivity\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n ]\n with self.assertRaises(ValueError):\n f_398(\"open\", data)\n def test_case_11(self):\n # Test with incorrect data\n data = \"Incorrect data type\"\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)\n def test_case_12(self):\n # Test for data list containing lists of varying lengths\n data = [\n [datetime(2022, 1, 1), 100, 105, 95, 102, 10000],\n [datetime(2022, 1, 2), 102, 108, 100],\n ]\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)\n def test_case_13(self):\n # Test for data list containing elements other than lists (mixed types)\n data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000], \"Not a list\"]\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)\n def test_case_14(self):\n # Test for a correctly structured and typed data list but with an empty inner list\n data = [[datetime(2022, 1, 1), 100, 
105, 95, 102, 10000], []]\n with self.assertRaises(ValueError):\n f_398(\"Open\", data)", "apis": ["numpy.sum", "pandas.DataFrame", "numpy.mean", "numpy.min", "numpy.max"], "libs": ["pandas", "numpy"], "doc": {"description": ["Analyzes a list of stock data and calculates the sum, mean, minimum, and maximum", "values for a specified column."], "note": [], "params": ["column (str): The name of the column to analyze. Valid options are 'Date', 'Open', 'High',", "'Low', 'Close', and 'Volume'.", "data (list of lists): A list where each element is a list representing stock data for a single day.", "Each inner list should contain values in the following order:", "'Date', 'Open', 'High', 'Low', 'Close', 'Volume'.", "Function will raise ValueError if the structure is not as expected."], "returns": ["dict: A dictionary containing the calculated 'sum', 'mean', 'min' (minimum), and 'max' (maximum)", "for the specified column. If the input data is empty, 'sum' will be 0, and 'mean', 'min', and", "'max' will be NaN."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> data = [[datetime(2022, 1, 1), 100, 105, 95, 102, 10000]]", ">>> results = f_398('Open', data)", ">>> results", "{'sum': 100, 'mean': 100.0, 'min': 100, 'max': 100}", ">>> type(results)", ""]}} +{"task_id": "f_839", "prompt": "import requests\nimport json\nfrom bs4 import BeautifulSoup\n\n\ndef f_839(url: str, file_name: str = \"Output.txt\") -> str:\n \"\"\"\n Scrape the title from a specified web page, save it in JSON format to a given file, \n and append to the file if it exists.\n\n Parameters:\n - url (str): The URL of the web page from which the title is to be scraped.\n - file_name (str, optional): The name of the file to save the scraped title. \n If the file already exists, the new data is appended. 
Defaults to 'Output.txt'.\n\n Returns:\n - str: The file path where the scraped title is saved.\n\n Requirements:\n - requests\n - json\n - bs4\n\n Notes:\n - If the web page does not have a title, 'None' is saved as the title value in the JSON data.\n - Data is appended to the specified file in JSON format, with each title on a new line.\n\n Example:\n >>> f_839(\"http://example.com\")\n 'Output.txt'\n >>> f_839(\"http://another-example.com\", \"AnotherOutput.txt\")\n 'AnotherOutput.txt'\n \"\"\"", "canonical_solution": " response = requests.get(url, timeout=5)\n soup = BeautifulSoup(response.text, \"html.parser\")\n title = soup.title.string if soup.title else None\n data = {\"title\": title}\n json_data = json.dumps(data)\n with open(file_name, \"a\", encoding=\"utf-8\") as f:\n f.write(json_data + \"\\n\")\n return file_name", "test": "import unittest\nfrom unittest.mock import patch, mock_open\nimport requests\nimport json\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_839\"\"\"\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_scrape_title_page_1(self, mock_file):\n \"\"\"Test that the title is scraped from a web page and saved to a file\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"Test Page 1\"\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": \"Test Page 1\"}) + \"\\n\"\n )\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_scrape_title_page_2(self, mock_file):\n \"\"\"Test that the title is scraped from a web page and saved to a file\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"Test Page 2\"\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\", 
\"AnotherOutput.txt\")\n self.assertEqual(file_path, \"AnotherOutput.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": \"Test Page 2\"}) + \"\\n\"\n )\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_invalid_url(self, mock_file):\n \"\"\"Test that an exception is raised when the URL is invalid\"\"\"\n with self.assertRaises(requests.RequestException):\n f_839(\"http://invalid-url\")\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_page_without_title(self, mock_file):\n \"\"\"Test that 'None' is saved as the title when the web page does not have a title\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"\"\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": None}) + \"\\n\"\n )\n @patch(\"builtins.open\", new_callable=mock_open, read_data=\"\")\n def test_very_long_title(self, mock_file):\n \"\"\"Test that a very long title is saved correctly\"\"\"\n long_title = \"A\" * 1024 # A very long title of 1024 characters\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = f\"{long_title}\".encode()\n with patch(\"requests.get\", return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_once_with(\n json.dumps({\"title\": long_title}) + \"\\n\"\n )\n @patch(\n \"builtins.open\",\n new_callable=mock_open,\n read_data=json.dumps({\"title\": \"Existing Title\"}) + \"\\n\",\n )\n def test_append_to_existing_file(self, mock_file):\n \"\"\"Test that data is appended to an existing file\"\"\"\n mock_response = requests.Response()\n mock_response.status_code = 200\n mock_response._content = b\"New Title\"\n with patch(\"requests.get\", 
return_value=mock_response):\n file_path = f_839(\"http://example.com\")\n self.assertEqual(file_path, \"Output.txt\")\n mock_file().write.assert_called_with(\n json.dumps({\"title\": \"New Title\"}) + \"\\n\"\n )", "apis": ["json.dumps", "bs4.BeautifulSoup", "requests.get"], "libs": ["bs4", "json", "requests"], "doc": {"description": ["Scrape the title from a specified web page, save it in JSON format to a given file,", "and append to the file if it exists.", "Notes:", "- If the web page does not have a title, 'None' is saved as the title value in the JSON data.", "- Data is appended to the specified file in JSON format, with each title on a new line."], "note": [], "params": ["url (str): The URL of the web page from which the title is to be scraped.", "file_name (str, optional): The name of the file to save the scraped title.", "If the file already exists, the new data is appended. Defaults to 'Output.txt'."], "returns": ["str: The file path where the scraped title is saved."], "reqs": ["requests", "json", "bs4"], "raises": [], "example": [">>> f_839(\"http://example.com\")", "'Output.txt'", ">>> f_839(\"http://another-example.com\", \"AnotherOutput.txt\")", "'AnotherOutput.txt'"]}} +{"task_id": "f_376", "prompt": "import pandas as pd\nimport re\nimport random\n\n\ndef f_376(data_list, seed=None):\n \"\"\"\n Removes a random comma-separated value (treated as a \"substring\") from each string\n in a list and returns a pandas DataFrame containing the original and modified strings.\n\n Parameters:\n - data_list (list of str): A list of comma-separated strings. 
The function will remove\n leading and trailing whitespaces first before processing.\n - seed (int, optional): Seed for the random number generator for reproducibility.\n Default is None, which uses system time.\n\n Returns:\n - DataFrame: A pandas DataFrame with columns 'Original String' and 'Modified String'.\n\n Requirements:\n - pandas\n - re\n - random\n\n Example:\n >>> f_376(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=42)\n Original String Modified String\n 0 lamp, bag, mirror lamp, bag\n 1 table, chair, bag, lamp chair, bag, lamp\n \"\"\"", "canonical_solution": " if seed is not None:\n random.seed(seed)\n\n df = pd.DataFrame([s.strip() for s in data_list], columns=[\"Original String\"])\n\n modified_strings = []\n for s in data_list:\n substrings = re.split(\", \", s)\n random_substring = random.choice(substrings)\n modified_s = (\n s.replace(\", \" + random_substring, \"\")\n if \", \" + random_substring in s\n else s.replace(random_substring + \", \", \"\")\n )\n modified_strings.append(modified_s)\n\n df[\"Modified String\"] = modified_strings\n\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.columns = [\"Original String\", \"Modified String\"]\n def test_case_1(self):\n # Test basic case\n input_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, input_data)\n def test_case_2(self):\n # Test single character\n input_data = [\"a, b, c, d, e\", \"f, g, h, i, j\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, input_data)\n def test_case_3(self):\n # Test single numeric characters\n input_data = [\"1, 2, 3\", \"4, 5, 6, 7\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, input_data)\n def test_case_4(self):\n # Test with an empty list\n input_data = []\n result = f_376(input_data, seed=42)\n self.assertTrue(result.empty)\n def test_case_5(self):\n # Test 
with strings without commas\n input_data = [\"apple\", \"car\"]\n result = f_376(input_data, seed=42)\n # Ensure dataframe has correct columns\n self.assertListEqual(list(result.columns), self.columns)\n # Ensure 'Modified String' is the same as 'Original String' for single values\n for orig, mod in zip(result[\"Original String\"], result[\"Modified String\"]):\n self.assertEqual(orig.strip(), mod)\n def test_case_6(self):\n # Test strings with leading and trailing spaces\n input_data = [\" apple, orange, banana \", \" car, bike, plane\"]\n expected_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result = f_376(input_data, seed=42)\n self._test_dataframe(result, expected_data)\n def test_case_7(self):\n # Test strings where the same value appears multiple times\n input_data = [\"apple, apple, banana\", \"car, car, bike, plane\"]\n result = f_376(input_data, seed=42)\n # Special case where substrings might be duplicated\n for orig, mod in zip(result[\"Original String\"], result[\"Modified String\"]):\n diff = len(orig.split(\", \")) - len(mod.split(\", \"))\n self.assertTrue(diff in [0, 1]) # Either no change or one substring removed\n def test_case_8(self):\n # Test reproducibility with the same seed\n input_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result1 = f_376(input_data, seed=42)\n result2 = f_376(input_data, seed=42)\n pd.testing.assert_frame_equal(result1, result2)\n def test_case_9(self):\n # Test difference with different seeds\n input_data = [\"apple, orange, banana\", \"car, bike, plane\"]\n result1 = f_376(input_data, seed=42)\n result2 = f_376(input_data, seed=43)\n self.assertFalse(result1.equals(result2))\n def _test_dataframe(self, df, input_data):\n # Ensure dataframe has correct columns\n self.assertListEqual(list(df.columns), self.columns)\n # Ensure 'Modified String' has one less substring than 'Original String'\n for orig, mod in zip(df[\"Original String\"], df[\"Modified String\"]):\n self.assertTrue(orig in 
input_data) # Ensure original string is from input\n self.assertEqual(len(orig.split(\", \")) - 1, len(mod.split(\", \")))", "apis": ["pandas.DataFrame", "re.split", "random.seed", "random.choice"], "libs": ["random", "re", "pandas"], "doc": {"description": ["Removes a random comma-separated value (treated as a \"substring\") from each string", "in a list and returns a pandas DataFrame containing the original and modified strings."], "note": [], "params": ["data_list (list of str): A list of comma-separated strings. The function will remove", "leading and trailing whitespaces first before processing.", "seed (int, optional): Seed for the random number generator for reproducibility.", "Default is None, which uses system time."], "returns": ["DataFrame: A pandas DataFrame with columns 'Original String' and 'Modified String'."], "reqs": ["pandas", "re", "random"], "raises": [], "example": [">>> f_376(['lamp, bag, mirror', 'table, chair, bag, lamp'], seed=42)", "Original String Modified String", "0 lamp, bag, mirror lamp, bag", "1 table, chair, bag, lamp chair, bag, lamp"]}} +{"task_id": "f_855", "prompt": "import requests\nfrom pathlib import Path\nimport zipfile\n\n# Constants\nDOWNLOAD_DIR = Path(\"downloads\")\nZIP_DIR = Path(\"unzipped_files\")\n\n\ndef f_855(url, filename):\n \"\"\"\n Downloads and extracts a zip file from a specified URL.\n\n Parameters:\n url (str): The URL of the zip file to download.\n filename (str): The filename under which the downloaded zip file will be saved.\n\n Returns:\n tuple: A tuple containing a status message and a list of filenames in the unzipped directory, or an empty list if extraction fails.\n\n Note:\n the status message will contain \"Error\" when:\n - Network-related exceptions are raised if the download fails.\n - File-related exceptions are raised if there is an issue with file handling or extraction.\n\n Requirements:\n - requests\n - pathlib.Path\n - zipfile\n\n Example:\n >>> f_855('http://example.com/myfile.zip', 
'myfile.zip')\n ('Download and extraction successful', ['file1.txt', 'file2.txt'])\n \"\"\"", "canonical_solution": " try:\n # Download the file\n response = requests.get(url, stream=True, timeout=5)\n if response.status_code == 200:\n filepath = DOWNLOAD_DIR / filename\n filepath.parent.mkdir(parents=True, exist_ok=True)\n\n with open(filepath, \"wb\") as handle:\n for data in response.iter_content():\n handle.write(data)\n\n # Unzip the file\n zip_dir = ZIP_DIR / filename[:-4]\n zip_dir.mkdir(parents=True, exist_ok=True)\n\n with zipfile.ZipFile(filepath, \"r\") as zip_ref:\n zip_ref.extractall(zip_dir)\n\n return \"Download and extraction successful\", [\n file.name for file in zip_dir.iterdir()\n ]\n return (\n f\"Download failed: HTTP status code {response.status_code}\",\n [],\n )\n except requests.exceptions.RequestException as e:\n return f\"Error: {e}\", []\n except zipfile.BadZipFile as e:\n return f\"Error: Invalid zip file: {e}\", []", "test": "import unittest\nfrom unittest.mock import MagicMock, patch\nimport shutil\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_855.\"\"\"\n def test_successful_download_and_extraction(self):\n \"\"\"Test a successful download and extraction.\"\"\"\n result = f_855(\n \"https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-zip-file.zip\",\n \"test.zip\",\n )\n self.assertIn(\"Download and extraction successful\", result[0])\n self.assertTrue(len(result[1]) > 0)\n @patch(\"requests.get\")\n def test_invalid_url(self, mock_get):\n \"\"\"Test an invalid URL.\"\"\"\n mock_get.return_value.status_code = 404\n result = f_855(\"http://invalidurl.com/file.zip\", \"test.zip\")\n self.assertIn(\"Download failed\", result[0])\n self.assertEqual(result[1], [])\n @patch(\"requests.get\")\n def test_non_200_http_response(self, mock_get):\n \"\"\"Test a non-200 HTTP response.\"\"\"\n mock_get.return_value.status_code = 404\n result = f_855(\"http://example.com/file.zip\", \"test.zip\")\n 
self.assertIn(\"Download failed\", result[0])\n self.assertEqual(result[1], [])\n @patch(\"requests.get\")\n def test_network_error(self, mock_get):\n \"\"\"Test a network error.\"\"\"\n mock_get.side_effect = requests.exceptions.ConnectionError\n result = f_855(\"http://example.com/file.zip\", \"test.zip\")\n self.assertIn(\"Error\", result[0])\n self.assertEqual(result[1], [])\n @patch(\"builtins.open\", new_callable=MagicMock)\n @patch(\"requests.get\")\n @patch(\"zipfile.ZipFile\")\n def test_corrupted_zip_file(self, mock_zip, mock_get, mock_open):\n \"\"\"Test a corrupted zip file.\"\"\"\n # Mock the response to simulate a successful download\n mock_response = MagicMock()\n mock_response.status_code = 200\n mock_response.iter_content = MagicMock(return_value=[b\"data\"])\n mock_get.return_value = mock_response\n # Mock the zipfile to raise a BadZipFile exception\n mock_zip.side_effect = zipfile.BadZipFile\n # Run the function\n result = f_855(\"http://example.com/corrupted.zip\", \"corrupted.zip\")\n # Check that the result indicates an error related to zip file extraction\n self.assertIn(\"Error\", result[0])\n self.assertIsInstance(result[1], list)\n self.assertEqual(len(result[1]), 0)\n @patch(\"requests.get\")\n def test_request_exception(self, mock_get):\n \"\"\"Test a network error.\"\"\"\n # Mock the requests.get to raise a RequestException\n mock_get.side_effect = requests.exceptions.RequestException\n # Run the function with a sample URL and filename\n result = f_855(\"http://example.com/file.zip\", \"test.zip\")\n # Check that the result indicates an error related to the network request\n self.assertIn(\"Error\", result[0])\n self.assertIsInstance(result[1], list)\n self.assertEqual(len(result[1]), 0)\n def tearDown(self):\n shutil.rmtree(DOWNLOAD_DIR, ignore_errors=True)\n shutil.rmtree(ZIP_DIR, ignore_errors=True)", "apis": ["zipfile.BadZipFile", "zipfile.ZipFile", "requests.exceptions", "requests.get", "pathlib.Path"], "libs": ["zipfile", 
"pathlib", "requests"], "doc": {"description": ["Downloads and extracts a zip file from a specified URL."], "note": ["the status message will contain \"Error\" when:", "Network-related exceptions are raised if the download fails.", "File-related exceptions are raised if there is an issue with file handling or extraction."], "params": ["url (str): The URL of the zip file to download.", "filename (str): The filename under which the downloaded zip file will be saved."], "returns": ["tuple: A tuple containing a status message and a list of filenames in the unzipped directory, or an empty list if extraction fails."], "reqs": ["requests", "pathlib.Path", "zipfile"], "raises": [], "example": [">>> f_855('http://example.com/myfile.zip', 'myfile.zip')", "('Download and extraction successful', ['file1.txt', 'file2.txt'])"]}} +{"task_id": "f_589", "prompt": "import pandas as pd\nfrom itertools import combinations\n\n# Constants\nMIN_PERCENTAGE = 0.75\n\ndef f_589(data, cols, percentage):\n \"\"\"\n Find all combinations of columns from a given DataFrame so that the absolute correlation between them is greater than a certain threshold.\n\n Parameters:\n - data (list): List of lists with the data, where the length of the inner list equals the number of columns\n - cols (list): List of column names\n - percentage (float): The threshold for the absolute correlation.\n\n Returns:\n - corr_combinations (list): A list of tuples where each tuple contains two column names.\n\n Requirements:\n - pandas\n - itertools\n\n Example:\n >>> result = f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.9)\n >>> print(result)\n [('x', 'y')]\n \"\"\"", "canonical_solution": " if not 0 <= percentage <= 1:\n raise ValueError('Percentage must be between 0 and 1')\n df = pd.DataFrame(data, columns=cols)\n corr_matrix = df.corr().abs()\n columns = corr_matrix.columns\n corr_combinations = []\n\n for col1, col2 in combinations(columns, 2):\n if corr_matrix.loc[col1, col2] > 
percentage:\n corr_combinations.append((col1, col2))\n\n return corr_combinations", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.9), [('x', 'y')])\n def test_case_2(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.5), [('x', 'y'), ('x', 'z'), ('y', 'z')])\n def test_case_3(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.1), [('x', 'y'), ('x', 'z'), ('y', 'z')])\n def test_case_4(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.0), [('x', 'y'), ('x', 'z'), ('y', 'z')])\n def test_case_5(self):\n self.assertEqual(f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 1.0), [])", "apis": ["pandas.DataFrame", "itertools.combinations"], "libs": ["itertools", "pandas"], "doc": {"description": ["Find all combinations of columns from a given DataFrame so that the absolute correlation between them is greater than a certain threshold."], "note": [], "params": ["data (list): List of lists with the data, where the length of the inner list equals the number of columns", "cols (list): List of column names", "percentage (float): The threshold for the absolute correlation."], "returns": ["corr_combinations (list): A list of tuples where each tuple contains two column names."], "reqs": ["pandas", "itertools"], "raises": [], "example": [">>> result = f_589([[5.1, 5.0, 1.4], [4.9, 4.8, 1.4], [4.7, 4.6, 2.0]], ['x', 'y', 'z'], 0.9)", ">>> print(result)", "[('x', 'y')]"]}} +{"task_id": "f_770", "prompt": "from collections import Counter\nimport itertools\nimport string\n\n\ndef f_770(word: str) -> dict:\n \"\"\"\n Create a dictionary containing all possible two-letter combinations of the lowercase English alphabets. 
\n The dictionary values represent the frequency of these two-letter combinations in the given word.\n If a combination does not appear in the word, its value will be 0.\n\n Requirements:\n - collections.Counter\n - itertools\n - string\n \n Parameters:\n - word (str): The input string containing alphabetic characters.\n\n Returns:\n - dict: A dictionary with keys as two-letter alphabet combinations and values as their counts in the word.\n\n Requirements:\n - The function uses the `collections.Counter` library to count the occurrences of two-letter combinations.\n - The function uses the `itertools.permutations` method to generate all two-letter combinations of alphabets.\n - The function uses the `string` library to get a string of lowercase alphabets.\n\n Example:\n >>> list(f_770('abcdef').items())[:5]\n [('ab', 1), ('ac', 0), ('ad', 0), ('ae', 0), ('af', 0)]\n \"\"\"", "canonical_solution": " ALPHABETS = string.ascii_lowercase\n # Generate all two-letter combinations of alphabets\n permutations = [''.join(x) for x in itertools.permutations(ALPHABETS, 2)]\n combinations = permutations + [x*2 for x in ALPHABETS]\n \n # Generate all two-letter combinations in the word\n word_combinations = [''.join(x) for x in zip(word, word[1:])]\n # Count the occurrences of each two-letter combination in the word\n word_counter = Counter(word_combinations)\n\n # Create the dictionary with the counts\n return {key: word_counter.get(key, 0) for key in combinations}", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n result = f_770('abcdef')\n self.assertEqual(result['ab'], 1)\n self.assertEqual(result['ac'], 0)\n self.assertEqual(result['bc'], 1)\n self.assertEqual(result['cb'], 0)\n self.assertEqual(result['zz'], 0)\n \n def test_case_2(self):\n result = f_770('aabbcc')\n self.assertEqual(result['aa'], 1)\n self.assertEqual(result['ab'], 1)\n self.assertEqual(result['ba'], 0)\n self.assertEqual(result['bb'], 1)\n 
self.assertEqual(result['bc'], 1)\n \n def test_case_3(self):\n result = f_770('fedcba')\n self.assertEqual(result['fe'], 1)\n self.assertEqual(result['ef'], 0)\n self.assertEqual(result['dc'], 1)\n self.assertEqual(result['ba'], 1)\n self.assertEqual(result['zz'], 0)\n def test_case_4(self):\n result = f_770('cadbfe')\n self.assertEqual(result['ca'], 1)\n self.assertEqual(result['ad'], 1)\n self.assertEqual(result['db'], 1)\n self.assertEqual(result['fe'], 1)\n self.assertEqual(result['zz'], 0)\n def test_case_5(self):\n result = f_770('')\n self.assertEqual(result['ab'], 0)\n self.assertEqual(result['zz'], 0)", "apis": ["collections.Counter", "string.ascii_lowercase", "itertools.permutations"], "libs": ["itertools", "collections", "string"], "doc": {"description": ["Create a dictionary containing all possible two-letter combinations of the lowercase English alphabets.", "The dictionary values represent the frequency of these two-letter combinations in the given word.", "If a combination does not appear in the word, its value will be 0."], "note": [], "params": ["word (str): The input string containing alphabetic characters."], "returns": ["dict: A dictionary with keys as two-letter alphabet combinations and values as their counts in the word."], "reqs": ["collections.Counter", "itertools", "string", "The function uses the `collections.Counter` library to count the occurrences of two-letter combinations.", "The function uses the `itertools.permutations` method to generate all two-letter combinations of alphabets.", "The function uses the `string` library to get a string of lowercase alphabets."], "raises": [], "example": [">>> list(f_770('abcdef').items())[:5]", "[('ab', 1), ('ac', 0), ('ad', 0), ('ae', 0), ('af', 0)]"]}} +{"task_id": "f_868", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import MinMaxScaler\n\n# Constants\nPLOT_TITLE = \"Scaled Values\"\n\n\ndef f_868(data_dict):\n \"\"\"\n Scales the values in a 
given dictionary using MinMaxScaler and plots the scaled data.\n\n Parameters:\n - data_dict (dict): A dictionary where keys represent column names and values are lists of numerical data.\n The values may contain missing data (None), which are handled by dropping them before scaling.\n\n Returns:\n - pandas.DataFrame containing the scaled data.\n - matplotlib Axes object that displays the plot of the scaled data.\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Example:\n >>> data = {'a': [1, 2, None, 4], 'b': [5, None, 7, 8]}\n >>> scaled_df, plot_ax = f_868(data)\n >>> scaled_df\n a b\n 0 0.0 0.0\n 1 1.0 1.0\n >>> plot_ax.get_title()\n 'Scaled Values'\n \"\"\"", "canonical_solution": " df = pd.DataFrame(data_dict).dropna()\n\n if df.empty:\n ax = plt.gca()\n ax.set_title(PLOT_TITLE)\n return df, ax\n\n scaler = MinMaxScaler()\n scaled_data = scaler.fit_transform(df)\n df_scaled = pd.DataFrame(scaled_data, columns=df.columns)\n\n ax = df_scaled.plot()\n ax.set_title(PLOT_TITLE)\n\n return df_scaled, ax", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n \"\"\"Unit tests for the function.\"\"\"\n def test_empty_data(self):\n \"\"\"\n Test with an empty dictionary. Should return an empty DataFrame and a plot object.\n \"\"\"\n result_df, result_ax = f_868({})\n self.assertTrue(result_df.empty)\n self.assertIsNotNone(result_ax)\n def test_all_none_data(self):\n \"\"\"\n Test with a dictionary where all values are None. Should return an empty DataFrame and a plot object.\n \"\"\"\n data = {\"a\": [None, None], \"b\": [None, None]}\n result_df, result_ax = f_868(data)\n self.assertTrue(result_df.empty)\n self.assertIsNotNone(result_ax)\n def test_normal_data(self):\n \"\"\"\n Test with a normal data dictionary. 
Should return a non-empty DataFrame and a plot object.\n \"\"\"\n data = {\"a\": [1, 2, 3], \"b\": [4, 5, 6]}\n result_df, result_ax = f_868(data)\n self.assertEqual(result_ax.get_title(), \"Scaled Values\")\n self.assertFalse(result_df.empty)\n self.assertEqual(result_df.shape, (3, 2))\n self.assertIsNotNone(result_ax)\n def test_with_missing_values(self):\n \"\"\"\n Test data with some missing values. Missing values should be dropped, and scaled data should be returned.\n \"\"\"\n data = {\"a\": [1, None, 3], \"b\": [4, 5, None]}\n result_df, result_ax = f_868(data)\n self.assertEqual(result_df.shape, (1, 2)) # Only one row without missing values\n self.assertIsNotNone(result_ax)\n def test_with_negative_values(self):\n \"\"\"\n Test data with negative values. Should handle negative values correctly and return scaled data.\n \"\"\"\n data = {\"a\": [-1, -2, -3], \"b\": [1, 2, 3]}\n result_df, result_ax = f_868(data)\n self.assertFalse(result_df.empty)\n self.assertEqual(result_df.shape, (3, 2))\n self.assertIsNotNone(result_ax)", "apis": ["pandas.DataFrame", "matplotlib.pyplot.gca", "sklearn.preprocessing.MinMaxScaler"], "libs": ["sklearn", "pandas", "matplotlib"], "doc": {"description": ["Scales the values in a given dictionary using MinMaxScaler and plots the scaled data."], "note": [], "params": ["data_dict (dict): A dictionary where keys represent column names and values are lists of numerical data.", "The values may contain missing data (None), which are handled by dropping them before scaling."], "returns": ["pandas.DataFrame containing the scaled data.", "matplotlib Axes object that displays the plot of the scaled data."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": [], "example": [">>> data = {'a': [1, 2, None, 4], 'b': [5, None, 7, 8]}", ">>> scaled_df, plot_ax = f_868(data)", ">>> scaled_df", "a b", "0 0.0 0.0", "1 1.0 1.0", ">>> plot_ax.get_title()", "'Scaled Values'"]}} +{"task_id": "f_895", "prompt": "import collections\nimport numpy 
as np\nimport matplotlib.pyplot as plt\n\n\ndef f_895(data_dict):\n \"\"\"\n Analyze the uniformity of a distribution represented by a dictionary of categories and their counts,\n and create a description to introduce this distribution.\n\n Parameters:\n - data_dict (dict): A dictionary with categories as keys and counts as values.\n\n Returns:\n - tuple: A tuple containing:\n - matplotlib.axes._subplots.Axes: The axes object of the histogram.\n - str: A message indicating whether the distribution is uniform (\"The distribution is uniform.\")\n or not (\"The distribution is not uniform.\").\n\n Note:\n - If 'data_dict' is empty, the function returns None and a message \"The distribution is uniform.\"\n indicating that an empty distribution is considered uniform by default.\n - If 'data_dict' is not empty, it calculates the average count of the categories.\n - The distribution is considered uniform if the absolute difference between each count and the\n average count is less than or equal to 1e-5.\n - If any count's absolute difference with the average count is more than 1e-5, the distribution\n is considered not uniform.\n - The function then creates a histogram of the counts using matplotlib, with the number of bins\n being the lesser of 10 or the number of unique counts. 
The histogram's x-ticks are labeled with\n the category names.\n\n Requirements:\n - collections\n - numpy\n - matplotlib\n\n Example:\n >>> data = {'A': 2, 'B': 3, 'C': 4, 'D': 1, 'E': 2}\n >>> ax, message = f_895(data)\n >>> print(message)\n The distribution is not uniform.\n \"\"\"", "canonical_solution": " if not data_dict:\n return None, \"The distribution is uniform.\"\n\n data_counter = collections.Counter(data_dict)\n counts = list(data_counter.values())\n avg_count = sum(counts) / len(counts)\n uniform = all(abs(count - avg_count) <= 1e-5 for count in counts)\n message = (\n \"The distribution is uniform.\"\n if uniform\n else \"The distribution is not uniform.\"\n )\n\n _, ax = plt.subplots()\n ax.hist(\n counts,\n bins=np.linspace(min(counts), max(counts), min(10, len(counts))),\n rwidth=0.8,\n )\n ax.set_xticks(np.arange(len(data_dict)) + 1)\n ax.set_xticklabels(list(data_dict.keys()))\n return ax, message", "test": "import numpy as np\nimport matplotlib.pyplot as plt\nimport unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_895.\"\"\"\n def test_uniform_distribution(self):\n \"\"\"Test whether the function correctly identifies a uniform distribution.\"\"\"\n data = {\"A\": 5, \"B\": 5, \"C\": 5}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is uniform.\")\n def test_non_uniform_distribution(self):\n \"\"\"Test whether the function correctly identifies a non-uniform distribution.\"\"\"\n data = {\"A\": 3, \"B\": 2, \"C\": 4}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is not uniform.\")\n def test_empty_dictionary(self):\n \"\"\"Test the function with an empty dictionary.\"\"\"\n data = {}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is uniform.\")\n def test_single_category(self):\n \"\"\"Test the function with a single category.\"\"\"\n data = {\"A\": 1}\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is uniform.\")\n def 
test_large_distribution(self):\n \"\"\"Test the function with a large number of categories.\"\"\"\n data = {chr(i): i for i in range(65, 91)} # A to Z with ascending counts\n _, message = f_895(data)\n self.assertEqual(message, \"The distribution is not uniform.\")", "apis": ["matplotlib.pyplot.subplots", "collections.Counter", "numpy.linspace", "numpy.arange"], "libs": ["collections", "numpy", "matplotlib"], "doc": {"description": ["Analyze the uniformity of a distribution represented by a dictionary of categories and their counts,", "and create a description to introduce this distribution."], "note": ["If 'data_dict' is empty, the function returns None and a message \"The distribution is uniform.\"", "indicating that an empty distribution is considered uniform by default.", "If 'data_dict' is not empty, it calculates the average count of the categories.", "The distribution is considered uniform if the absolute difference between each count and the", "average count is less than or equal to 1e-5.", "If any count's absolute difference with the average count is more than 1e-5, the distribution", "is considered not uniform.", "The function then creates a histogram of the counts using matplotlib, with the number of bins", "being the lesser of 10 or the number of unique counts. 
The histogram's x-ticks are labeled with", "the category names."], "params": ["data_dict (dict): A dictionary with categories as keys and counts as values."], "returns": ["tuple: A tuple containing:", "matplotlib.axes._subplots.Axes: The axes object of the histogram.", "str: A message indicating whether the distribution is uniform (\"The distribution is uniform.\")", "or not (\"The distribution is not uniform.\")."], "reqs": ["collections", "numpy", "matplotlib"], "raises": [], "example": [">>> data = {'A': 2, 'B': 3, 'C': 4, 'D': 1, 'E': 2}", ">>> ax, message = f_895(data)", ">>> print(message)", "The distribution is not uniform."]}} +{"task_id": "f_809", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n\ndef f_809(df: pd.DataFrame) -> pd.DataFrame:\n \"\"\"\n Calculate the cumulative sum for each column in a given DataFrame and plot\n the results in a bar chart.\n\n Args:\n df (pd.DataFrame): The input DataFrame with numerical values.\n Must not be empty and must contain numeric data to plot.\n Returns:\n - tuple: A tuple containing:\n (1) A DataFrame with cumulative sums for each column.\n (2) A matplotlib bar chart Figure of these cumulative sums.\n\n Raises:\n - ValueError: If the DataFrame is empty or contains non-numeric data.\n\n Requirements:\n - pandas\n - matplotlib\n\n Note:\n - NaN values are ignored in the cumulative sum calculation, i.e. treated as\n zero for the purpose of the sum without changing existing values to NaN.\n - The plot title is set to 'Cumulative Sum per Column'.\n - X-axis label is 'Index' and Y-axis label is 'Cumulative Sum'.\n - A legend is included in the plot.\n\n Example:\n >>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})\n >>> output_df, fig = f_809(input_df)\n >>> output_df\n A B\n 0 1 4\n 1 3 9\n 2 6 15\n >>> fig\n
\n \"\"\"", "canonical_solution": " cumsum_df = df.cumsum()\n\n fig, ax = plt.subplots()\n cumsum_df.plot(kind=\"bar\", ax=ax)\n ax.set_title(\"Cumulative Sum per Column\")\n ax.set_xlabel(\"Index\")\n ax.set_ylabel(\"Cumulative Sum\")\n ax.legend()\n\n return cumsum_df, fig", "test": "import numpy as np\nimport pandas as pd\nimport unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Setup common for all tests\n self.input_df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n self.expected_df = pd.DataFrame({\"A\": [1, 3, 6], \"B\": [4, 9, 15]})\n def test_case_1(self):\n # Test basic case\n output_df, _ = f_809(self.input_df)\n pd.testing.assert_frame_equal(output_df, self.expected_df)\n def test_case_2(self):\n # Test cumulative sum correctness for a case with negative values\n input_df_neg = pd.DataFrame({\"A\": [1, -2, 3], \"B\": [-4, 5, -6]})\n expected_df_neg = pd.DataFrame({\"A\": [1, -1, 2], \"B\": [-4, 1, -5]})\n output_df_neg, _ = f_809(input_df_neg)\n pd.testing.assert_frame_equal(output_df_neg, expected_df_neg)\n def test_case_3(self):\n # Test bar chart properties\n _, fig = f_809(self.input_df)\n self.assertIsInstance(fig, plt.Figure)\n ax = fig.axes[0] # Get the Axes object from the figure\n # Verify the title, x-label, and y-label\n self.assertEqual(ax.get_title(), \"Cumulative Sum per Column\")\n self.assertEqual(ax.get_xlabel(), \"Index\")\n self.assertEqual(ax.get_ylabel(), \"Cumulative Sum\")\n # Ensure that a legend is present and contains the correct labels\n legend_labels = [text.get_text() for text in ax.get_legend().get_texts()]\n expected_labels = self.input_df.columns.tolist()\n self.assertEqual(legend_labels, expected_labels)\n def test_case_4(self):\n # Test with an empty DataFrame\n empty_df = pd.DataFrame()\n with self.assertRaises(Exception):\n f_809(empty_df)\n def test_case_5(self):\n # Test with DataFrame containing NaN values\n nan_df = pd.DataFrame({\"A\": [1, np.nan, 3], 
\"B\": [4, 5, np.nan]})\n nan_df_cumsum = nan_df.cumsum()\n output_nan_df, _ = f_809(nan_df)\n pd.testing.assert_frame_equal(output_nan_df, nan_df_cumsum)\n def test_case_6(self):\n # Test with DataFrame containing all zeros\n zeros_df = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n expected_zeros_df = pd.DataFrame({\"A\": [0, 0, 0], \"B\": [0, 0, 0]})\n output_zeros_df, _ = f_809(zeros_df)\n pd.testing.assert_frame_equal(output_zeros_df, expected_zeros_df)\n def test_case_7(self):\n # Test with a DataFrame containing only one row\n one_row_df = pd.DataFrame({\"A\": [1], \"B\": [2]})\n expected_one_row_df = pd.DataFrame({\"A\": [1], \"B\": [2]})\n output_one_row_df, _ = f_809(one_row_df)\n pd.testing.assert_frame_equal(output_one_row_df, expected_one_row_df)", "apis": ["pandas.DataFrame", "matplotlib.pyplot.subplots"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Calculate the cumulative sum for each column in a given DataFrame and plot", "the results in a bar chart.", "Args:", "df (pd.DataFrame): The input DataFrame with numerical values.", "Must not be empty and must contain numeric data to plot."], "note": ["NaN values are ignored in the cumulative sum calculation, i.e. treated as", "zero for the purpose of the sum without changing existing values to NaN.", "The plot title is set to 'Cumulative Sum per Column'.", "X-axis label is 'Index' and Y-axis label is 'Cumulative Sum'.", "A legend is included in the plot."], "params": [], "returns": ["tuple: A tuple containing:", "(1) A DataFrame with cumulative sums for each column.", "(2) A matplotlib bar chart Figure of these cumulative sums."], "reqs": ["pandas", "matplotlib"], "raises": ["ValueError: If the DataFrame is empty or contains non-numeric data."], "example": [">>> input_df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})", ">>> output_df, fig = f_809(input_df)", ">>> output_df", "A B", "0 1 4", "1 3 9", "2 6 15", ">>> fig", "
"]}} +{"task_id": "f_559", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.linear_model import LinearRegression\n\nROWS = 100\nCOLUMNS = ['X', 'Y']\n\ndef f_559(df):\n \"\"\"\n Given a Pandas DataFrame with random numeric values and columns X & Y, use sklearn's linear regression to match the data to a linear model.\n\n Parameters:\n - df (DataFrame): The DataFrame to use.\n\n Returns:\n - model (LinearRegression): The fitted linear model.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.normal(size=(100, 2)), columns=['X', 'Y'])\n >>> model = f_559(df)\n >>> print(model)\n LinearRegression()\n \"\"\"", "canonical_solution": " X = pd.DataFrame(df[['X']]) # Extracting column 'X' as a DataFrame\n y = pd.Series(df['Y']) # Extracting column 'Y' as a Series\n \n # Fitting the linear regression model\n model = LinearRegression().fit(X, y)\n \n return model", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n \n def test_case_2(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n def test_case_3(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n self.assertTrue(model.intercept_ is not None)\n def test_case_4(self):\n df = pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n self.assertTrue(model.intercept_ is not None)\n self.assertTrue(model.score(df[['X']], df['Y']) is not None)\n def test_case_5(self):\n df = 
pd.DataFrame(np.random.normal(size=(ROWS, len(COLUMNS))), columns=COLUMNS)\n model = f_559(df)\n self.assertTrue(model is not None)\n self.assertTrue(model.coef_ is not None)\n self.assertTrue(model.intercept_ is not None)\n self.assertTrue(model.score(df[['X']], df['Y']) is not None)\n self.assertTrue(model.score(df[['X']], df['Y']) >= 0)", "apis": ["pandas.DataFrame", "sklearn.linear_model.LinearRegression", "pandas.Series"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Given a Pandas DataFrame with random numeric values and columns X & Y, use sklearn's linear regression to match the data to a linear model."], "note": [], "params": ["df (DataFrame): The DataFrame to use."], "returns": ["model (LinearRegression): The fitted linear model."], "reqs": ["numpy", "pandas", "sklearn"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.normal(size=(100, 2)), columns=['X', 'Y'])", ">>> model = f_559(df)", ">>> print(model)", "LinearRegression()"]}} +{"task_id": "f_843", "prompt": "import urllib.request\nimport os\nimport json\nimport pandas as pd\n\n# Constants\nTARGET_JSON_FILE = \"downloaded_file.json\"\n\n\ndef f_843(url):\n \"\"\"\n This function retrieves a JSON file from the given URL using urllib.request.urlretrieve,\n temporarily saving it as 'downloaded_file.json'. 
It then opens and reads this file,\n converts the JSON content into a pandas DataFrame, and finally deletes the temporary JSON file.\n\n Parameters:\n url (str): The URL of the JSON file to be downloaded.\n\n Returns:\n pandas.DataFrame: A DataFrame constructed from the JSON data in the downloaded file.\n\n Requirements:\n - urllib.request\n - os\n - json\n - pandas\n\n Example:\n >>> f_843('http://example.com/employees.json')\n name age city\n 0 Alice 25 New York\n 1 Bob 30 San Francisco\n \"\"\"", "canonical_solution": " urllib.request.urlretrieve(url, TARGET_JSON_FILE)\n\n with open(TARGET_JSON_FILE, \"r\") as f:\n data = json.load(f)\n\n os.remove(TARGET_JSON_FILE)\n\n return pd.DataFrame(data)", "test": "import unittest\nimport pandas as pd\nfrom unittest.mock import patch, mock_open\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_843 function.\"\"\"\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_sample_1(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function returns the correct DataFrame for a given JSON file.\"\"\"\n url = \"http://example.com/sample_1.json\"\n sample_data = '[{\"name\": \"Alice\", \"age\": 25, \"city\": \"New York\"}, {\"name\": \"Bob\", \"age\": 30, \"city\": \"San Francisco\"}]'\n mock_urlretrieve.return_value = None\n with patch(\"builtins.open\", mock_open(read_data=sample_data)):\n expected_df = pd.DataFrame(\n [\n {\"name\": \"Alice\", \"age\": 25, \"city\": \"New York\"},\n {\"name\": \"Bob\", \"age\": 30, \"city\": \"San Francisco\"},\n ]\n )\n result_df = f_843(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n mock_remove.assert_called_once_with(\"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_sample_2(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function returns the correct DataFrame for a given JSON file.\"\"\"\n url 
= \"http://example.com/sample_2.json\"\n sample_data = '[{\"product\": \"Laptop\", \"price\": 1000}, {\"product\": \"Mouse\", \"price\": 20}, {\"product\": \"Keyboard\", \"price\": 50}]'\n mock_urlretrieve.return_value = None\n with patch(\"builtins.open\", mock_open(read_data=sample_data)):\n expected_df = pd.DataFrame(\n [\n {\"product\": \"Laptop\", \"price\": 1000},\n {\"product\": \"Mouse\", \"price\": 20},\n {\"product\": \"Keyboard\", \"price\": 50},\n ]\n )\n result_df = f_843(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n mock_remove.assert_called_once_with(\"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_empty_json(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function returns an empty DataFrame for an empty JSON file.\"\"\"\n url = \"http://example.com/empty.json\"\n sample_data = \"[]\"\n mock_urlretrieve.return_value = None\n with patch(\"builtins.open\", mock_open(read_data=sample_data)):\n expected_df = pd.DataFrame()\n result_df = f_843(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n def test_invalid_url(self, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the URL is invalid.\"\"\"\n url = \"http://example.com/non_existent.json\"\n mock_urlretrieve.side_effect = Exception(\"URL retrieval failed\")\n with self.assertRaises(Exception):\n f_843(url)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")\n @patch(\"urllib.request.urlretrieve\")\n @patch(\"os.remove\")\n def test_invalid_json(self, mock_remove, mock_urlretrieve):\n \"\"\"Test that the function raises an exception when the JSON file is invalid.\"\"\"\n url = \"http://example.com/invalid.json\"\n sample_data = \"invalid json content\"\n 
mock_urlretrieve.return_value = None\n with patch(\n \"builtins.open\", mock_open(read_data=sample_data)\n ), self.assertRaises(Exception):\n f_843(url)\n mock_urlretrieve.assert_called_once_with(url, \"downloaded_file.json\")", "apis": ["urllib.request", "pandas.DataFrame", "os.remove", "json.load", "urllib.request.urlretrieve"], "libs": ["os", "urllib", "pandas", "json"], "doc": {"description": ["This function retrieves a JSON file from the given URL using urllib.request.urlretrieve,", "temporarily saving it as 'downloaded_file.json'. It then opens and reads this file,", "converts the JSON content into a pandas DataFrame, and finally deletes the temporary JSON file."], "note": [], "params": ["url (str): The URL of the JSON file to be downloaded."], "returns": ["pandas.DataFrame: A DataFrame constructed from the JSON data in the downloaded file."], "reqs": ["urllib.request", "os", "json", "pandas"], "raises": [], "example": [">>> f_843('http://example.com/employees.json')", "name age city", "0 Alice 25 New York", "1 Bob 30 San Francisco"]}} +{"task_id": "f_852", "prompt": "import xml.etree.ElementTree as ET\nimport csv\n\n\ndef f_852(xml_content, output_csv_path):\n \"\"\"\n Parses XML content from a string and converts it into a CSV format.\n\n Parameters:\n - xml_content (str): A string containing the XML content to be parsed. It should\n be well-formed XML.\n - output_csv_path (str): The file path where the resulting CSV file will be saved.\n This path must be valid and accessible for writing.\n\n Returns:\n - None: The function does not return any value. Instead, it writes the output to\n a CSV file at the specified path.\n\n Raises:\n - ET.ParseError: This exception is raised if the input XML content is malformed or\n cannot be successfully parsed. 
The exception message includes\n details about the parsing error.\n - IOError: Raised if there is an issue with writing to the specified CSV file path.\n This can happen due to reasons like invalid file path, full disk space,\n lack of write permissions, etc. The exception message provides details\n about the IO error.\n\n\n Requirements:\n - xml\n - csv\n\n Example:\n >>> f_852('data', 'path/to/output.csv')\n >>> with open('path/to/output.csv', 'r') as f:\n ... print(f.read())\n element,data\n\n Note:\n - Ensure that the XML content passed to the function is well-formed.\n - The output CSV path should be a valid file path where the user has write\n permissions, to prevent IOError.\n \"\"\"", "canonical_solution": " try:\n root = ET.fromstring(xml_content)\n data = [[elem.tag, elem.text] for elem in root.iter()]\n\n with open(output_csv_path, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n writer = csv.writer(f)\n writer.writerows(data)\n except ET.ParseError as e:\n raise ET.ParseError(f\"Error parsing XML: {e}\") from e\n except IOError as e:\n raise IOError(f\"Error writing CSV file: {e}\") from e", "test": "import unittest\nimport xml.etree.ElementTree as ET\nimport csv\nimport shutil\nfrom pathlib import Path\nimport os\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_852.\"\"\"\n test_data_dir = \"mnt/data/f_852_data_chien\"\n @classmethod\n def setUpClass(cls):\n \"\"\"Set up method to create a directory for test files.\"\"\"\n cls.test_dir = Path(cls.test_data_dir)\n cls.test_dir.mkdir(parents=True, exist_ok=True)\n def check_csv_content(self, xml_content, csv_path):\n \"\"\"Helper function to check if the CSV content matches the XML content.\"\"\"\n root = ET.fromstring(xml_content)\n expected_data = [\n [elem.tag, elem.text if elem.text is not None else \"\"]\n for elem in root.iter()\n ]\n with open(csv_path, \"r\", encoding=\"utf-8\") as file:\n reader = csv.reader(file)\n csv_data = list(reader)\n self.assertEqual(expected_data, 
csv_data)\n def test_simple_xml(self):\n \"\"\"Test with simple XML content.\"\"\"\n xml_content = \"data\"\n csv_output = self.test_dir / \"output_scenario_0.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_nested_xml(self):\n \"\"\"Test with nested XML content.\"\"\"\n xml_content = \"data\"\n csv_output = self.test_dir / \"output_scenario_1.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_empty_xml(self):\n \"\"\"Test with an empty XML.\"\"\"\n xml_content = \"\"\n csv_output = self.test_dir / \"output_scenario_2.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_xml_with_attributes(self):\n \"\"\"Test with an XML that contains elements with attributes.\"\"\"\n xml_content = 'data'\n csv_output = self.test_dir / \"output_scenario_3.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_large_xml(self):\n \"\"\"Test with a larger XML file.\"\"\"\n xml_content = (\n \"\"\n + \"\".join([f\"{i}\" for i in range(100)])\n + \"\"\n )\n csv_output = self.test_dir / \"output_scenario_4.csv\"\n f_852(xml_content, csv_output)\n self.check_csv_content(xml_content, csv_output)\n def test_invalid_xml_content(self):\n \"\"\"Test with invalid XML content to trigger ET.ParseError.\"\"\"\n xml_content = \"datadata\"\n csv_output = self.test_dir / \"non_existent_directory\" / \"output.csv\"\n with self.assertRaises(IOError):\n f_852(xml_content, csv_output)\n @classmethod\n def tearDownClass(cls):\n # Cleanup the test directories\n dirs_to_remove = [\"mnt/data\", \"mnt\"]\n for dir_path in dirs_to_remove:\n if os.path.exists(dir_path):\n shutil.rmtree(dir_path)", "apis": ["csv.writer", "xml.etree.ElementTree.fromstring", "xml.etree.ElementTree.ParseError"], "libs": ["xml", "csv"], "doc": {"description": ["Parses XML content from a string and converts it into a CSV format."], "note": 
["Ensure that the XML content passed to the function is well-formed.", "The output CSV path should be a valid file path where the user has write", "permissions, to prevent IOError."], "params": ["xml_content (str): A string containing the XML content to be parsed. It should", "be well-formed XML.", "output_csv_path (str): The file path where the resulting CSV file will be saved.", "This path must be valid and accessible for writing."], "returns": ["None: The function does not return any value. Instead, it writes the output to", "a CSV file at the specified path."], "reqs": ["xml", "csv"], "raises": ["ET.ParseError: This exception is raised if the input XML content is malformed or", "cannot be successfully parsed. The exception message includes", "details about the parsing error.", "IOError: Raised if there is an issue with writing to the specified CSV file path.", "This can happen due to reasons like invalid file path, full disk space,", "lack of write permissions, etc. The exception message provides details", "about the IO error."], "example": [">>> f_852('data', 'path/to/output.csv')", ">>> with open('path/to/output.csv', 'r') as f:", "... print(f.read())", "element,data"]}} +{"task_id": "f_930", "prompt": "import string\nimport random\nimport pandas as pd\nimport numpy as np\n\n# Constants\nNUM_SAMPLES = 1000 # Number of samples\n\n\ndef f_930():\n \"\"\"\n Generates a DataFrame with two columns: a string field and a float field.\n The string field contains randomly generated strings of 10 ASCII letters.\n The float field contains randomly generated numbers between 0 and 10000,\n formatted with two decimal places and a comma as the thousands separator.\n\n Parameters:\n - None\n\n Returns:\n DataFrame: A pandas DataFrame with NUM_SAMPLES rows. 
Each row contains a\n random string in the 'String Field' column and a formatted float in the\n 'Float Field' column.\n\n Requirements:\n - string\n - random\n - pandas\n - numpy\n\n Example:\n >>> random.seed(0)\n >>> np.random.seed(0)\n >>> dataset = f_930()\n >>> print(dataset.head(1))\n String Field Float Field\n 0 RNvnAvOpyE 5,488.14\n\n Note: The exact values in the dataset will vary as they are randomly generated.\n \"\"\"", "canonical_solution": " data = {\n \"String Field\": [\n \"\".join(random.choices(string.ascii_letters, k=10))\n for _ in range(NUM_SAMPLES)\n ],\n \"Float Field\": [f\"{x:,.2f}\" for x in np.random.uniform(0, 10000, NUM_SAMPLES)],\n }\n\n df = pd.DataFrame(data)\n\n return df", "test": "import unittest\nimport pandas as pd\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_930.\"\"\"\n def test_dataframe_creation(self):\n \"\"\"\n Test if the function returns a pandas DataFrame.\n \"\"\"\n random.seed(1)\n result = f_930()\n self.assertIsInstance(result, pd.DataFrame)\n def test_row_count(self):\n \"\"\"\n Test if the DataFrame contains the correct number of rows.\n \"\"\"\n random.seed(2)\n result = f_930()\n self.assertEqual(len(result), NUM_SAMPLES)\n def test_column_count(self):\n \"\"\"\n Test if the DataFrame contains exactly two columns.\n \"\"\"\n random.seed(3)\n result = f_930()\n self.assertEqual(len(result.columns), 2)\n def test_string_field_format(self):\n \"\"\"\n Test if the 'String Field' contains strings of 10 ASCII letters.\n \"\"\"\n random.seed(4)\n result = f_930()\n all_strings = all(result[\"String Field\"].str.match(\"^[A-Za-z]{10}$\"))\n self.assertTrue(all_strings)\n def test_float_field_format(self):\n \"\"\"\n Test if the 'Float Field' contains formatted float strings.\n \"\"\"\n random.seed(5)\n result = f_930()\n all_floats = all(\n isinstance(float(val.replace(\",\", \"\")), float)\n for val in result[\"Float Field\"]\n )\n self.assertTrue(all_floats)", "apis": 
["string.ascii_letters", "numpy.random", "numpy.random.uniform", "pandas.DataFrame", "random.choices"], "libs": ["random", "pandas", "numpy", "string"], "doc": {"description": ["Generates a DataFrame with two columns: a string field and a float field.", "The string field contains randomly generated strings of 10 ASCII letters.", "The float field contains randomly generated numbers between 0 and 10000,", "formatted with two decimal places and a comma as the thousands separator."], "note": ["The exact values in the dataset will vary as they are randomly generated."], "params": ["None"], "returns": ["DataFrame: A pandas DataFrame with NUM_SAMPLES rows. Each row contains a", "random string in the 'String Field' column and a formatted float in the", "'Float Field' column."], "reqs": ["string", "random", "pandas", "numpy"], "raises": [], "example": [">>> random.seed(0)", ">>> np.random.seed(0)", ">>> dataset = f_930()", ">>> print(dataset.head(1))", "String Field Float Field", "0 RNvnAvOpyE 5,488.14"]}} +{"task_id": "f_758", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\ndef f_758(df: pd.DataFrame) -> tuple:\n \"\"\"\n Visualize the distribution of stock closing prices using both a box plot and a histogram\n within a single figure. This function is designed to help understand the spread, central tendency,\n and the distribution shape of stock closing prices.\n\n Note:\n The tile of the box plot is set to 'Box Plot of Closing Prices' and the title of the histogram is set to 'Histogram of Closing Prices'.\n \n Requirements:\n - pandas\n - matplotlib.pyplot\n - seaborn\n\n Parameters:\n df (DataFrame): A pandas DataFrame containing at least one column named 'closing_price'\n with stock closing prices.\n\n Returns:\n tuple: A tuple containing two matplotlib.axes._subplots.Axes objects: the first for the boxplot\n and the second for the histogram.\n\n Example:\n >>> df = pd.DataFrame({\n ... 
'closing_price': [100, 101, 102, 103, 104, 150]\n ... })\n >>> boxplot_ax, histplot_ax = f_758(df)\n >>> print(boxplot_ax.get_title())\n Box Plot of Closing Prices\n >>> print(histplot_ax.get_title())\n Histogram of Closing Prices\n \"\"\"", "canonical_solution": " fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n \n boxplot_ax = sns.boxplot(x=df['closing_price'], ax=axes[0])\n boxplot_ax.set_title('Box Plot of Closing Prices')\n \n histplot_ax = sns.histplot(df['closing_price'], kde=True, ax=axes[1])\n histplot_ax.set_title('Histogram of Closing Prices')\n \n plt.tight_layout()\n plt.close(fig) # Prevent automatic figure display within Jupyter notebooks or interactive environments.\n \n return boxplot_ax, histplot_ax", "test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n# Assuming the function f_758 is defined in the same script, otherwise import it appropriately.\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n df = pd.DataFrame({\n 'closing_price': [100, 101, 102, 103, 104, 150]\n })\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n \n self.assertEqual(boxplot_ax.get_title(), 'Box Plot of Closing Prices')\n self.assertEqual(histplot_ax.get_title(), 'Histogram of Closing Prices')\n \n self.assertEqual(histplot_ax.get_xlabel(), 'closing_price')\n self.assertIn('Count', histplot_ax.get_ylabel()) # Check if 'Count' is part of the ylabel\n \n def test_empty_df(self):\n df = pd.DataFrame({'closing_price': []})\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n # Instead of checking if the plot \"has data,\" we ensure that it exists and does not raise an error.\n self.assertIsNotNone(boxplot_ax, \"Boxplot should be created even with empty data.\")\n self.assertIsNotNone(histplot_ax, \"Histogram should be created even with empty 
data.\")\n def test_invalid_column(self):\n df = pd.DataFrame({'price': [100, 101, 102]})\n with self.assertRaises(KeyError):\n f_758(df)\n def test_single_value_df(self):\n df = pd.DataFrame({'closing_price': [100]})\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n self.assertTrue(boxplot_ax.has_data(), \"Boxplot should handle a single value dataframe.\")\n self.assertTrue(histplot_ax.has_data(), \"Histogram should handle a single value dataframe.\")\n def test_large_values_df(self):\n df = pd.DataFrame({'closing_price': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]})\n boxplot_ax, histplot_ax = f_758(df)\n \n self.assertIsInstance(boxplot_ax, plt.Axes)\n self.assertIsInstance(histplot_ax, plt.Axes)\n self.assertTrue(boxplot_ax.has_data(), \"Boxplot should handle large values.\")\n self.assertTrue(histplot_ax.has_data(), \"Histogram should handle large values.\")", "apis": ["seaborn.boxplot", "matplotlib.pyplot.tight_layout", "pandas.DataFrame", "matplotlib.pyplot.close", "matplotlib.pyplot.subplots", "seaborn.histplot"], "libs": ["seaborn", "pandas", "matplotlib"], "doc": {"description": ["Visualize the distribution of stock closing prices using both a box plot and a histogram", "within a single figure. This function is designed to help understand the spread, central tendency,", "and the distribution shape of stock closing prices."], "note": ["The tile of the box plot is set to 'Box Plot of Closing Prices' and the title of the histogram is set to 'Histogram of Closing Prices'."], "params": ["df (DataFrame): A pandas DataFrame containing at least one column named 'closing_price'", "with stock closing prices."], "returns": ["tuple: A tuple containing two matplotlib.axes._subplots.Axes objects: the first for the boxplot", "and the second for the histogram."], "reqs": ["pandas", "matplotlib.pyplot", "seaborn"], "raises": [], "example": [">>> df = pd.DataFrame({", "... 
'closing_price': [100, 101, 102, 103, 104, 150]", "... })", ">>> boxplot_ax, histplot_ax = f_758(df)", ">>> print(boxplot_ax.get_title())", "Box Plot of Closing Prices", ">>> print(histplot_ax.get_title())", "Histogram of Closing Prices"]}} +{"task_id": "f_737", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\n\n# Constants\nARRAY_SIZE = 10000\n\ndef f_737():\n \"\"\"\n Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution.\n\n Returns:\n Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes).\n\n Note:\n The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Values\". \n The x-axis is labeled \"Val\" and the y-axis is labeled \"Freq\". \n The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines.\n \n Requirements:\n - numpy\n - matplotlib.pyplot\n \n Example:\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> array, mean, std, ax = f_737()\n >>> print(mean, std)\n 250.7154 142.85617453522966\n >>> plt.show()\n \"\"\"", "canonical_solution": " array = np.random.randint(1, 500, size=ARRAY_SIZE)\n mean = np.mean(array)\n std = np.std(array)\n\n fig, ax = plt.subplots()\n ax.hist(array, bins='auto')\n ax.set_title('Histogram of Random Values')\n ax.set_xlabel('Val')\n ax.set_ylabel('Freq')\n return array, mean, std, ax", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n np.random.seed(0)\n array, mean, std, ax = f_737()\n self.assertEqual(array.size, ARRAY_SIZE)\n self.assertEqual(mean, 250.7154)\n self.assertEqual(std, 142.85617453522966)\n self.assertEqual(ax.get_title(), 'Histogram of Random Values')\n def test_case_2(self):\n array, mean, std, ax = f_737()\n self.assertEqual(ax.get_xlabel(), 'Val')\n self.assertEqual(ax.get_ylabel(), 'Freq')\n def test_case_3(self):\n 
np.random.seed(42)\n array, mean, std, ax = f_737()\n self.assertEqual(array[0], 103)\n self.assertEqual(array[-1], 474)\n self.assertEqual(mean, 250.171)\n self.assertEqual(std, 144.01374920124815)\n \n def test_case_4(self):\n np.random.seed(142)\n array, mean, std, ax = f_737()\n self.assertEqual(array[0], 278)\n self.assertEqual(array[-1], 113)\n self.assertEqual(mean, 251.1245)\n self.assertEqual(std, 144.49066405740547)\n def test_case_5(self):\n np.random.seed(250)\n array, mean, std, ax = f_737()\n self.assertEqual(array[0], 367)\n self.assertEqual(array[-1], 190)\n self.assertEqual(mean, 249.037)\n self.assertEqual(std, 144.32681882103546)", "apis": ["numpy.random.randint", "numpy.random", "numpy.mean", "matplotlib.pyplot.subplots", "numpy.std"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Create a numeric array of random integers, calculate the mean and standard deviation, and draw a histogram of the distribution."], "note": ["The random integers are generated between 1 and 100. The title of the histogram is \"Histogram of Random Values\".", "The x-axis is labeled \"Val\" and the y-axis is labeled \"Freq\".", "The mean is plotted as a red dashed line, and the standard deviation is plotted as purple dashed lines."], "params": [], "returns": ["Tuple: A tuple containing the array, mean, standard deviation, and the histogram plot (Axes)."], "reqs": ["numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(0)", ">>> array, mean, std, ax = f_737()", ">>> print(mean, std)", "250.7154 142.85617453522966", ">>> plt.show()"]}} +{"task_id": "f_364", "prompt": "import pandas as pd\nimport random\nimport matplotlib.pyplot as plt\n\n\ndef f_364(num_rows=100, categories=[\"a\", \"b\", \"c\", \"d\", \"e\"], random_seed=42):\n \"\"\"\n Create a Pandas DataFrame with specified number of rows. 
Each row contains a randomly\n selected category from the provided categories list and a random integer between 1 and 100.\n\n The function also generates a bar chart visualizing the counts of each category in the DataFrame\n and returns both the DataFrame and the bar chart.\n\n Parameters:\n - num_rows (int): Number of rows in the DataFrame. Default is 100. Must be at least 1.\n - categories (list): List of categories to choose from. Default is ['a', 'b', 'c', 'd', 'e'].\n - random_seed (int): Seed for random number generation to ensure reproducibility. Default is 42.\n\n Returns:\n - pd.DataFrame: A pandas DataFrame with randomly generated category data.\n - plt.Axes: A bar chart visualizing the category counts.\n\n Requirements:\n - pandas\n - random\n\n Example:\n >>> df, ax = f_364(num_rows=5)\n >>> df\n Category Value\n 0 a 18\n 1 a 95\n 2 c 14\n 3 b 87\n 4 b 95\n \"\"\"", "canonical_solution": " if num_rows <= 0:\n raise ValueError(\"num_rows must not be negative\")\n\n random.seed(random_seed)\n\n df = pd.DataFrame(\n {\n \"Category\": [\n categories[random.randint(0, len(categories) - 1)]\n for _ in range(num_rows)\n ],\n \"Value\": [random.randint(1, 100) for _ in range(num_rows)],\n }\n )\n\n ax = (\n df[\"Category\"]\n .value_counts()\n .plot(kind=\"bar\", title=\"Category Counts\", figsize=(10, 6))\n )\n\n return df, ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test with default parameters\n df, ax = f_364()\n self.assertEqual(len(df), 100)\n self.assertTrue(\n set(df[\"Category\"].unique()).issubset(set([\"a\", \"b\", \"c\", \"d\", \"e\"]))\n )\n self.assertTrue(df[\"Value\"].min() >= 1)\n self.assertTrue(df[\"Value\"].max() <= 100)\n self.assertEqual(ax.get_title(), \"Category Counts\")\n def test_case_2(self):\n # Test num_rows\n for num_rows in [10, 50, 100]:\n df, _ = f_364(num_rows=num_rows)\n self.assertEqual(len(df), num_rows)\n def test_case_3(self):\n # Test 
edge case - 0 rows\n with self.assertRaises(Exception):\n f_364(num_rows=0)\n def test_case_4(self):\n # Test edge case - invalid num_rows\n with self.assertRaises(Exception):\n f_364(num_rows=-1)\n def test_case_5(self):\n # Test categories\n df, _ = f_364(categories=[\"x\", \"y\", \"z\"])\n self.assertTrue(set(df[\"Category\"].unique()).issubset(set([\"x\", \"y\", \"z\"])))\n def test_case_6(self):\n # Test edge case - single category\n df, _ = f_364(categories=[\"unique\"])\n self.assertTrue(\n set([\"unique\"]).issubset(df[\"Category\"].unique()),\n \"Should work with a single category\",\n )\n def test_case_7(self):\n # Test edge case - empty categories\n with self.assertRaises(Exception):\n f_364(categories=[])\n def test_case_8(self):\n # Test random seed\n df1, _ = f_364(random_seed=123)\n df2, _ = f_364(random_seed=123)\n df3, _ = f_364(random_seed=124)\n self.assertTrue(\n df1.equals(df2), \"DataFrames should be identical with the same seed\"\n )\n self.assertFalse(\n df1.equals(df3), \"DataFrames should differ with different seeds\"\n )\n def test_case_9(self):\n # Test visualization\n categories = [\"x\", \"y\", \"z\"]\n _, ax = f_364(num_rows=100, categories=categories, random_seed=42)\n ax_categories = [tick.get_text() for tick in ax.get_xticklabels()]\n self.assertListEqual(\n sorted(categories),\n sorted(ax_categories),\n \"X-axis categories should match input categories\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["pandas.DataFrame", "random.randint", "random.seed"], "libs": ["random", "pandas"], "doc": {"description": ["Create a Pandas DataFrame with specified number of rows. Each row contains a randomly", "selected category from the provided categories list and a random integer between 1 and 100.", "The function also generates a bar chart visualizing the counts of each category in the DataFrame", "and returns both the DataFrame and the bar chart."], "note": [], "params": ["num_rows (int): Number of rows in the DataFrame. 
Default is 100. Must be at least 1.", "categories (list): List of categories to choose from. Default is ['a', 'b', 'c', 'd', 'e'].", "random_seed (int): Seed for random number generation to ensure reproducibility. Default is 42."], "returns": ["pd.DataFrame: A pandas DataFrame with randomly generated category data.", "plt.Axes: A bar chart visualizing the category counts."], "reqs": ["pandas", "random"], "raises": [], "example": [">>> df, ax = f_364(num_rows=5)", ">>> df", "Category Value", "0 a 18", "1 a 95", "2 c 14", "3 b 87", "4 b 95"]}} +{"task_id": "f_896", "prompt": "import pandas as pd\nfrom sklearn.feature_extraction.text import CountVectorizer\nimport matplotlib.pyplot as plt\n\n# Constants\nSTOP_WORDS = [\"a\", \"an\", \"the\", \"in\", \"on\", \"at\", \"and\", \"or\"]\n\n\ndef f_896(file_path, save_path=None):\n \"\"\"\n This function processes a text dataset from a CSV file, performs text vectorization while excluding specific\n stopwords, and creates a histogram of the ten most common words. The function is robust to different input\n scenarios, such as empty data or data containing only stopwords.\n\n Parameters:\n - file_path (str): Path to the CSV file containing the text data. The CSV should have a single text column.\n - save_path (str, optional): Path where the histogram plot will be saved. If not provided, the plot is displayed.\n\n Returns:\n - matplotlib Axes object: If save_path is not provided and valid words are found in the input, the function\n displays the histogram plot and returns the matplotlib Axes object.\n - None: In two scenarios:\n 1. If save_path is provided, saves the plot to the specified location and returns None.\n 2. 
If the input file is empty or contains only stop words, prints a message and returns None.\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Examples:\n >>> ax = f_896('text_data.csv')\n # ax is the matplotlib Axes object for the plot\n >>> result = f_896('text_data.csv', 'output_plot.png')\n # result is None, and the plot is saved to 'output_plot.png'\n \"\"\"", "canonical_solution": " df = pd.read_csv(file_path, header=None, names=[\"Text\"])\n df[\"Text\"] = df[\"Text\"].str.split(\"\\\\n\").str.join(\" \")\n\n vectorizer = CountVectorizer(stop_words=STOP_WORDS)\n try:\n word_count = vectorizer.fit_transform(df[\"Text\"])\n except ValueError:\n # Handle the case where the DataFrame is empty or contains only stop words\n print(\"No valid words to plot. Returning None.\")\n return None\n\n sum_words = word_count.sum(axis=0)\n words_freq = [\n (word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()\n ]\n words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)\n\n top_words = words_freq[:10]\n df = pd.DataFrame(top_words, columns=[\"Word\", \"Count\"])\n\n ax = df.plot.bar(x=\"Word\", y=\"Count\", rot=0)\n\n # Saving or displaying the plot\n if save_path:\n plt.savefig(save_path)\n plt.close()\n return None\n else:\n return ax", "test": "import unittest\nfrom unittest.mock import patch\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_896\"\"\"\n @patch(\"pandas.read_csv\")\n def test_empty_csv(self, mock_read_csv):\n \"\"\"\n Test with an empty CSV file. Checks if the function handles empty data gracefully.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame(columns=[\"Text\"])\n result = f_896(\"dummy_path.csv\")\n self.assertIsNone(result, \"The function should return None for empty data\")\n @patch(\"pandas.read_csv\")\n def test_single_line_csv(self, mock_read_csv):\n \"\"\"\n Test with a CSV file containing a single line of text. 
Verifies correct handling of minimal data.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"test\"]})\n ax = f_896(\"dummy_path.csv\")\n self.assertEqual(\n len(ax.patches),\n 1,\n \"There should be one bar in the histogram for a single word\",\n )\n @patch(\"pandas.read_csv\")\n def test_stop_words_removal(self, mock_read_csv):\n \"\"\"\n Test to ensure that stop words are correctly removed from the text.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"a test\"]})\n ax = f_896(\"dummy_path.csv\")\n x_labels = [label.get_text() for label in ax.get_xticklabels()]\n self.assertNotIn(\"a\", x_labels, \"Stop words should not appear in the histogram\")\n @patch(\"pandas.read_csv\")\n @patch(\"matplotlib.pyplot.savefig\")\n def test_save_plot(self, mock_savefig, mock_read_csv):\n \"\"\"\n Test the functionality of saving the plot to a file.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"save test\"]})\n f_896(\"dummy_path.csv\", \"output.png\")\n mock_savefig.assert_called_with(\"output.png\")\n @patch(\"pandas.read_csv\")\n def test_multiple_lines_csv(self, mock_read_csv):\n \"\"\"\n Test with a CSV file containing multiple lines of text. Checks for correct handling of multiline data.\n \"\"\"\n mock_read_csv.return_value = pd.DataFrame({\"Text\": [\"test1\", \"test2\"]})\n ax = f_896(\"dummy_path.csv\")\n self.assertEqual(\n len(ax.patches),\n 2,\n \"There should be two bars in the histogram for two different words\",\n )\n def tearDown(self):\n plt.close()", "apis": ["pandas.read_csv", "sklearn.feature_extraction.text.CountVectorizer", "pandas.DataFrame", "matplotlib.pyplot.savefig", "matplotlib.pyplot.close"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["This function processes a text dataset from a CSV file, performs text vectorization while excluding specific", "stopwords, and creates a histogram of the ten most common words. 
The function is robust to different input", "scenarios, such as empty data or data containing only stopwords."], "note": [], "params": ["file_path (str): Path to the CSV file containing the text data. The CSV should have a single text column.", "save_path (str, optional): Path where the histogram plot will be saved. If not provided, the plot is displayed."], "returns": ["matplotlib Axes object: If save_path is not provided and valid words are found in the input, the function", "displays the histogram plot and returns the matplotlib Axes object.", "None: In two scenarios:", "1. If save_path is provided, saves the plot to the specified location and returns None.", "2. If the input file is empty or contains only stop words, prints a message and returns None."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": [], "example": ["Examples:", ">>> ax = f_896('text_data.csv')", "# ax is the matplotlib Axes object for the plot", ">>> result = f_896('text_data.csv', 'output_plot.png')", "# result is None, and the plot is saved to 'output_plot.png'"]}} +{"task_id": "f_369", "prompt": "import matplotlib.pyplot as plt\nimport numpy as np\n\n\ndef f_369(myList):\n \"\"\"\n Draws a histogram of the values in a list and returns the plot's Axes.\n\n For visualization:\n - Bin edges are adjusted to align with integer values in `myList`.\n - Histogram bars are outlined in black.\n - X-axis label: 'Value'\n - Y-axis label: 'Frequency'\n - Plot title: 'Histogram of Values'\n\n Parameters:\n - myList (list): List of numerical values to plot.\n\n Returns:\n - ax (matplotlib.axes._axes.Axes): Axes object of the histogram plot.\n\n Requirements:\n - matplotlib.pyplot\n - numpy\n\n Example:\n >>> myList = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]\n >>> ax = f_369(myList)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(0.0, 0, '0.0'), Text(0.5, 0, '0.5'), Text(1.0, 0, '1.0'), Text(1.5, 0, '1.5'), Text(2.0, 0, '2.0'), Text(2.5, 0, '2.5'), Text(3.0, 0, '3.0'), Text(3.5, 0, '3.5'), 
Text(4.0, 0, '4.0'), Text(4.5, 0, '4.5'), Text(5.0, 0, '5.0')]\n \"\"\"", "canonical_solution": " _, ax = plt.subplots()\n ax.hist(\n myList, bins=np.arange(min(myList), max(myList) + 2) - 0.5, edgecolor=\"black\"\n )\n ax.set_xlabel(\"Value\")\n ax.set_ylabel(\"Frequency\")\n ax.set_title(\"Histogram of Values\")\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n myList = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]\n ax = f_369(myList)\n heights, _, _ = ax.hist(\n myList,\n bins=np.arange(min(myList), max(myList) + 2) - 0.5,\n edgecolor=\"black\",\n )\n self.assertIsInstance(ax, plt.Axes)\n self.assertListEqual(list(heights), [1, 2, 3, 4])\n self.assertEqual(ax.get_title(), \"Histogram of Values\")\n self.assertEqual(ax.get_xlabel(), \"Value\")\n self.assertEqual(ax.get_ylabel(), \"Frequency\")\n def test_case_2(self):\n # Test with empty list\n with self.assertRaises(ValueError):\n f_369([])\n def test_case_3(self):\n # Test with single element\n myList = [100]\n ax = f_369(myList)\n heights, _, _ = ax.hist(myList)\n self.assertEqual(heights.max(), 1)\n def test_case_4(self):\n # Test with negative values\n myList = [-5, -4, -3, -3, -2, -2, -2, -1]\n ax = f_369(myList)\n heights, _, _ = ax.hist(myList)\n self.assertGreaterEqual(len(heights), 1)\n def test_case_5(self):\n # Test with floats\n myList = [1.1, 1.2, 2.5, 2.5, 3.75, 4.25]\n ax = f_369(myList)\n heights, _, _ = ax.hist(myList)\n self.assertGreaterEqual(len(heights), 1)\n def test_case_6(self):\n # Test handling non-numeric values\n myList = [\"a\", \"b\", \"c\"]\n with self.assertRaises(TypeError):\n f_369(myList)\n def tearDown(self):\n plt.close(\"all\")", "apis": ["matplotlib.pyplot.subplots", "numpy.arange"], "libs": ["numpy", "matplotlib"], "doc": {"description": ["Draws a histogram of the values in a list and returns the plot's Axes.", "For visualization:", "- Bin edges are adjusted to 
align with integer values in `myList`.", "- Histogram bars are outlined in black.", "- X-axis label: 'Value'", "- Y-axis label: 'Frequency'", "- Plot title: 'Histogram of Values'"], "note": [], "params": ["myList (list): List of numerical values to plot."], "returns": ["ax (matplotlib.axes._axes.Axes): Axes object of the histogram plot."], "reqs": ["matplotlib.pyplot", "numpy"], "raises": [], "example": [">>> myList = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]", ">>> ax = f_369(myList)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(0.0, 0, '0.0'), Text(0.5, 0, '0.5'), Text(1.0, 0, '1.0'), Text(1.5, 0, '1.5'), Text(2.0, 0, '2.0'), Text(2.5, 0, '2.5'), Text(3.0, 0, '3.0'), Text(3.5, 0, '3.5'), Text(4.0, 0, '4.0'), Text(4.5, 0, '4.5'), Text(5.0, 0, '5.0')]"]}} +{"task_id": "f_887", "prompt": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Constants\nCATEGORIES = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n\n\ndef f_887(data_list):\n \"\"\"\n Processes a list of category labels to create a histogram that visualizes their distribution.\n This histogram compares the distribution of a predefined set of categories (A, B, C, D, E)\n with any additional categories found in the input list.\n\n Parameters:\n - data_list (list): A list containing category labels (strings).\n\n Returns:\n - Axes object (matplotlib.axes._subplots.Axes): The histogram displaying the distribution of categories.\n\n Requirements:\n - pandas\n - matplotlib\n\n Notes:\n - The function evaluates the distribution of predefined categories ('A', 'B', 'C', 'D', 'E') and checks for uniformity.\n - Categories in the data_list that are not among the predefined categories are identified and included in the histogram.\n - The ax.bar call in the function creates a bar plot on the axes object. 
It uses the following parameters:\n * all_categories: The categories to be displayed on the x-axis, including both predefined and extra categories.\n * category_counts.reindex(all_categories, fill_value=0): The counts of each category, where categories not found\n in the data_list are assigned a count of 0.\n * width=0.8: Sets the width of the bars in the bar plot.\n * align=\"center\": Aligns the bars with the center of the x-ticks.\n\n Raises:\n - ValueError: If the input data_list is empty, the function raises a ValueError with the message \"The data list is empty.\"\n In this case, no histogram is generated and the function terminates.\n\n\n Example:\n >>> data = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n >>> ax = f_887(data)\n >>> ax.get_xticks()\n array([0., 1., 2., 3., 4., 5., 6.])\n \"\"\"", "canonical_solution": "\n if not data_list:\n raise ValueError(\"The data list is empty.\")\n\n data_series = pd.Series(data_list)\n category_counts = data_series.value_counts()\n\n # Prepare data for predefined categories\n predefined_counts = category_counts.reindex(CATEGORIES, fill_value=0)\n\n # Check for uniformity in predefined categories\n if not all(x == predefined_counts.iloc[0] for x in predefined_counts):\n print(\"The distribution of predefined categories is not uniform.\")\n\n # Handling extra categories not in predefined list\n extra_categories = category_counts.drop(CATEGORIES, errors=\"ignore\").index.tolist()\n all_categories = CATEGORIES + extra_categories\n\n _, ax = plt.subplots()\n ax.bar(\n all_categories,\n category_counts.reindex(all_categories, fill_value=0),\n width=0.8,\n align=\"center\",\n )\n ax.set_xticks(all_categories)\n\n return ax", "test": "import unittest\nfrom unittest.mock import patch\nimport io\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function.\"\"\"\n def test_empty_list(self):\n \"\"\"\n Test the function with an empty list. 
Expects ValueError.\n \"\"\"\n with self.assertRaises(ValueError):\n f_887([])\n def test_uniform_distribution(self):\n \"\"\"\n Test the function with a uniform distribution of predefined categories.\n Expects no printed warning about non-uniform distribution.\n \"\"\"\n data = [\"A\", \"B\", \"C\", \"D\", \"E\"] * 2\n with patch(\"sys.stdout\", new=io.StringIO()) as fake_output:\n f_887(data)\n self.assertNotIn(\n \"The distribution of predefined categories is not uniform.\",\n fake_output.getvalue(),\n )\n def test_non_uniform_distribution(self):\n \"\"\"\n Test the function with a non-uniform distribution of predefined categories.\n Expects a printed warning about non-uniform distribution.\n \"\"\"\n data = [\"A\", \"A\", \"B\", \"C\", \"D\", \"E\"]\n with patch(\"sys.stdout\", new=io.StringIO()) as fake_output:\n f_887(data)\n self.assertIn(\n \"The distribution of predefined categories is not uniform.\",\n fake_output.getvalue(),\n )\n def test_extra_categories(self):\n \"\"\"\n Test the function with extra categories not in the predefined list.\n Expects extra categories to be included in the histogram.\n \"\"\"\n data = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\", \"G\"]\n ax = f_887(data)\n self.assertIn(\"F\", [tick.get_text() for tick in ax.get_xticklabels()])\n self.assertIn(\"G\", [tick.get_text() for tick in ax.get_xticklabels()])\n def test_no_extra_categories(self):\n \"\"\"\n Test the function with no extra categories.\n Expects only predefined categories to be included in the histogram.\n \"\"\"\n data = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n ax = f_887(data)\n for extra_cat in [\"F\", \"G\"]:\n self.assertNotIn(\n extra_cat, [tick.get_text() for tick in ax.get_xticklabels()]\n )\n def tearDown(self):\n plt.clf()", "apis": ["matplotlib.pyplot.subplots", "pandas.Series"], "libs": ["pandas", "matplotlib"], "doc": {"description": ["Processes a list of category labels to create a histogram that visualizes their distribution.", "This histogram compares 
the distribution of a predefined set of categories (A, B, C, D, E)", "with any additional categories found in the input list.", "Notes:", "- The function evaluates the distribution of predefined categories ('A', 'B', 'C', 'D', 'E') and checks for uniformity.", "- Categories in the data_list that are not among the predefined categories are identified and included in the histogram.", "- The ax.bar call in the function creates a bar plot on the axes object. It uses the following parameters:", "* all_categories: The categories to be displayed on the x-axis, including both predefined and extra categories.", "* category_counts.reindex(all_categories, fill_value=0): The counts of each category, where categories not found", "in the data_list are assigned a count of 0.", "* width=0.8: Sets the width of the bars in the bar plot.", "* align=\"center\": Aligns the bars with the center of the x-ticks."], "note": [], "params": ["data_list (list): A list containing category labels (strings)."], "returns": ["Axes object (matplotlib.axes._subplots.Axes): The histogram displaying the distribution of categories."], "reqs": ["pandas", "matplotlib"], "raises": ["ValueError: If the input data_list is empty, the function raises a ValueError with the message \"The data list is empty.\"", "In this case, no histogram is generated and the function terminates."], "example": [">>> data = ['A', 'B', 'C', 'D', 'E', 'F', 'G']", ">>> ax = f_887(data)", ">>> ax.get_xticks()", "array([0., 1., 2., 3., 4., 5., 6.])"]}} +{"task_id": "f_789", "prompt": "import numpy as np\nimport pandas as pd\nimport random\n\ndef f_789(rows=3, cols=2, min_val=0, max_val=100, seed=0):\n \"\"\"\n Creates a matrix of specified dimensions with random integers within a given range,\n and then converts it into a pandas DataFrame.\n \n Parameters:\n - rows (int): Number of rows in the matrix. Default is 3.\n - cols (int): Number of columns in the matrix. 
Default is 2.\n - min_val (int): Minimum integer value for the random integers. Default is 0.\n - max_val (int): Maximum integer value for the random integers. Default is 100.\n \n Returns:\n DataFrame: A pandas DataFrame containing random integers within the specified range.\n \n Requirements:\n - numpy\n - pandas\n - random\n\n Example:\n >>> df = f_789(3, 2, 0, 100)\n >>> print(type(df))\n \n >>> print(df.shape)\n (3, 2)\n \"\"\"", "canonical_solution": " random.seed(seed)\n if min_val == max_val:\n matrix = np.full((rows, cols), min_val)\n else:\n matrix = np.array([[random.randrange(min_val, max_val) for j in range(cols)] for i in range(rows)])\n df = pd.DataFrame(matrix)\n return df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = f_789()\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.iloc[:, 0].tolist(), [49, 53, 33])\n self.assertEqual(df.iloc[:, 1].tolist(), [97, 5, 65])\n \n def test_case_2(self):\n df = f_789(rows=5, cols=4)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(df.iloc[:, 0].tolist(), [49, 33, 38, 27, 17])\n self.assertEqual(df.iloc[:, 1].tolist(), [97, 65, 61, 64, 96])\n self.assertEqual(df.iloc[:, 2].tolist(), [53, 62, 45, 17, 12])\n def test_case_3(self):\n df = f_789(min_val=10, max_val=20)\n self.assertEqual(df.iloc[:, 0].tolist(), [16, 10, 18])\n self.assertEqual(df.iloc[:, 1].tolist(), [16, 14, 17])\n \n def test_case_4(self):\n df = f_789(min_val=50, max_val=50)\n self.assertEqual(df.iloc[:, 0].tolist(), [50, 50, 50])\n self.assertEqual(df.iloc[:, 1].tolist(), [50, 50, 50])\n def test_case_5(self):\n df = f_789(rows=0, cols=2)\n self.assertTrue(df.empty)", "apis": ["pandas.DataFrame", "numpy.array", "numpy.full", "random.randrange", "random.seed"], "libs": ["random", "numpy", "pandas"], "doc": {"description": ["Creates a matrix of specified dimensions with random integers within a given range,", "and then converts it into a pandas 
DataFrame."], "note": [], "params": ["rows (int): Number of rows in the matrix. Default is 3.", "cols (int): Number of columns in the matrix. Default is 2.", "min_val (int): Minimum integer value for the random integers. Default is 0.", "max_val (int): Maximum integer value for the random integers. Default is 100."], "returns": ["DataFrame: A pandas DataFrame containing random integers within the specified range."], "reqs": ["numpy", "pandas", "random"], "raises": [], "example": [">>> df = f_789(3, 2, 0, 100)", ">>> print(type(df))", "", ">>> print(df.shape)", "(3, 2)"]}} +{"task_id": "f_533", "prompt": "import csv\nimport sys\n\ndef f_533(filename):\n \"\"\"\n Read a CSV file, inverse the order of the lines and write the inverted lines back into the file. Then reset the cursor to the beginning of the file.\n\n Parameters:\n - filename (str): The name of the CSV file.\n\n Returns:\n - filename (str): The name of the CSV file.\n\n Requirements:\n - csv\n - sys\n\n Example:\n >>> f_533('file.csv')\n 'file.csv'\n \"\"\"", "canonical_solution": " try:\n with open(filename, 'r+') as file:\n reader = csv.reader(file)\n rows = list(reader)\n file.seek(0)\n file.truncate()\n\n writer = csv.writer(file)\n writer.writerows(reversed(rows))\n\n file.seek(0)\n except Exception as e:\n print(f\"An error occurred: {e}\", file=sys.stderr)\n\n return filename", "test": "import unittest\nimport os\nclass TestCases(unittest.TestCase):\n def base(self, filename, contents, expected):\n # Create file\n with open(filename, 'w') as file:\n file.write(contents)\n # Run function\n f_533(filename)\n # Check file\n with open(filename, 'r') as file:\n txt = file.read()\n self.assertEqual(txt, expected)\n # Remove file\n os.remove(filename)\n def test_case_1(self):\n self.base('file.csv', \"a,b\\nc,d\\ne,f\\ng,h\\n\", \"g,h\\ne,f\\nc,d\\na,b\\n\")\n \n def test_case_2(self):\n self.base('file.csv', \"a,b,c\\nd,e,f\\ng,h,i\\n\", \"g,h,i\\nd,e,f\\na,b,c\\n\")\n def test_case_3(self):\n 
self.base('file.csv', \"a,b,c,d\\ne,f,g,h\\ni,j,k,l\\n\", \"i,j,k,l\\ne,f,g,h\\na,b,c,d\\n\")\n \n def test_case_4(self):\n self.base('file.csv', \"a,b,c,d,e\\nf,g,h,i,j\\nk,l,m,n,o\\n\", \"k,l,m,n,o\\nf,g,h,i,j\\na,b,c,d,e\\n\")\n def test_case_5(self):\n self.base('file.csv', \"a,b,c,d,e,f\\ng,h,i,j,k,l\\nm,n,o,p,q,r\\n\", \"m,n,o,p,q,r\\ng,h,i,j,k,l\\na,b,c,d,e,f\\n\")", "apis": ["csv.reader", "csv.writer", "sys.stderr"], "libs": ["csv", "sys"], "doc": {"description": ["Read a CSV file, inverse the order of the lines and write the inverted lines back into the file. Then reset the cursor to the beginning of the file."], "note": [], "params": ["filename (str): The name of the CSV file."], "returns": ["filename (str): The name of the CSV file."], "reqs": ["csv", "sys"], "raises": [], "example": [">>> f_533('file.csv')", "'file.csv'"]}} +{"task_id": "f_908", "prompt": "import numpy as np\nimport seaborn as sns\n\n\ndef f_908(arr):\n \"\"\"\n Plots a heatmap of a given 2D numerical array and prints the sum of each row.\n The heatmap's color range is set based on the minimum and maximum values in the array.\n\n Parameters:\n arr (numpy.array): A 2D numpy array of numerical values.\n\n Returns:\n ax (matplotlib.axes.Axes): The Axes object with the plotted heatmap.\n\n Requirements:\n - numpy\n - seaborn\n\n Note:\n The function calculates the sum of each row and prints these values.\n The heatmap is plotted based on the original array with its color range set from the minimum to the maximum value in the array.\n\n Example:\n >>> arr = np.array([[i + j for i in range(3)] for j in range(5)])\n >>> ax = f_908(arr)\n >>> ax.get_title()\n 'Heatmap of the 2D Array'\n \"\"\"", "canonical_solution": " row_sums = arr.sum(axis=1)\n vmax = np.max(arr) # Set vmax to the maximum value in the array\n vmin = np.min(arr) # Set vmin to the minimum value in the array\n ax = sns.heatmap(\n arr, annot=True, vmax=vmax, vmin=vmin\n ) # Include both vmin and vmax in the heatmap call\n 
ax.set_title(\"Heatmap of the 2D Array\")\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_908.\"\"\"\n def tearDown(self):\n plt.clf()\n def test_scenario_1(self):\n \"\"\"Scenario 1: Testing with a 2D array created by adding row and column indices.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertEqual(ax.collections[0].colorbar.vmax, expected_vmax)\n def test_scenario_2(self):\n \"\"\"Scenario 2: Testing with a 2D array where each column has identical values based on the column index.\"\"\"\n arr = np.array([[i for i in range(3)] for j in range(5)])\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertEqual(ax.collections[0].colorbar.vmax, expected_vmax)\n def test_scenario_3(self):\n \"\"\"Scenario 3: Testing with a 2D array where each row has identical values based on the row index.\"\"\"\n arr = np.array([[j for i in range(3)] for j in range(5)])\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertEqual(ax.collections[0].colorbar.vmax, expected_vmax)\n def test_scenario_4(self):\n \"\"\"Scenario 4: Testing with a 2D array of zeros.\"\"\"\n arr = np.zeros((5, 3))\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertAlmostEqual(\n ax.collections[0].colorbar.vmax, expected_vmax, delta=0.2\n )\n def test_scenario_5(self):\n \"\"\"Scenario 5: Testing with a 2D array of ones.\"\"\"\n arr = np.ones((5, 3))\n expected_vmax = np.max(arr) # Calculate the expected vmax\n ax = 
f_908(arr)\n self.assertEqual(ax.get_title(), \"Heatmap of the 2D Array\")\n self.assertAlmostEqual(\n ax.collections[0].colorbar.vmax, expected_vmax, delta=0.2\n )", "apis": ["numpy.min", "seaborn.heatmap", "numpy.max"], "libs": ["seaborn", "numpy"], "doc": {"description": ["Plots a heatmap of a given 2D numerical array and prints the sum of each row.", "The heatmap's color range is set based on the minimum and maximum values in the array."], "note": ["The function calculates the sum of each row and prints these values.", "The heatmap is plotted based on the original array with its color range set from the minimum to the maximum value in the array."], "params": ["arr (numpy.array): A 2D numpy array of numerical values."], "returns": ["ax (matplotlib.axes.Axes): The Axes object with the plotted heatmap."], "reqs": ["numpy", "seaborn"], "raises": [], "example": [">>> arr = np.array([[i + j for i in range(3)] for j in range(5)])", ">>> ax = f_908(arr)", ">>> ax.get_title()", "'Heatmap of the 2D Array'"]}} +{"task_id": "f_792", "prompt": "import numpy as np\nimport pandas as pd\n\ndef f_792(rows, columns, seed=None):\n \"\"\"\n Generate a DataFrame with random values within a specified range.\n \n This function creates a matrix of given dimensions filled with random values between 0 and 1 and returns it as a Pandas DataFrame. Users have the option to set a random seed for reproducible results.\n \n Parameters:\n - rows (int): The number of rows for the matrix.\n - columns (int): The number of columns for the matrix.\n - seed (int, optional): The seed for the random number generator. 
Default is None.\n \n Returns:\n - DataFrame: A Pandas DataFrame containing the generated random values.\n \n Examples:\n >>> df = f_792(3, 2, seed=42)\n >>> print(df.shape)\n (3, 2)\n \n >>> df = f_792(1, 1, seed=24)\n >>> print(df.shape)\n (1, 1)\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n matrix = np.random.rand(rows, columns)\n df = pd.DataFrame(matrix)\n \n return df", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def setUp(self):\n self.seed = 42\n def test_case_1(self):\n df = f_792(3, 2, seed=self.seed)\n self.assertEqual(df.shape, (3, 2))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_2(self):\n df = f_792(5, 5, seed=self.seed)\n self.assertEqual(df.shape, (5, 5))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_3(self):\n df = f_792(1, 1, seed=self.seed)\n self.assertEqual(df.shape, (1, 1))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_4(self):\n df = f_792(4, 3, seed=self.seed)\n self.assertEqual(df.shape, (4, 3))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())\n \n def test_case_5(self):\n df = f_792(2, 2, seed=self.seed)\n self.assertEqual(df.shape, (2, 2))\n self.assertTrue((df >= 0).all().all())\n self.assertTrue((df <= 1).all().all())", "apis": ["pandas.DataFrame", "numpy.random", "numpy.random.seed", "numpy.random.rand"], "libs": ["numpy", "pandas"], "doc": {"description": ["Generate a DataFrame with random values within a specified range.", "This function creates a matrix of given dimensions filled with random values between 0 and 1 and returns it as a Pandas DataFrame. 
Users have the option to set a random seed for reproducible results.", ">>> df = f_792(1, 1, seed=24)", ">>> print(df.shape)", "(1, 1)"], "note": [], "params": ["rows (int): The number of rows for the matrix.", "columns (int): The number of columns for the matrix.", "seed (int, optional): The seed for the random number generator. Default is None."], "returns": ["DataFrame: A Pandas DataFrame containing the generated random values."], "reqs": [], "raises": [], "example": ["Examples:", ">>> df = f_792(3, 2, seed=42)", ">>> print(df.shape)", "(3, 2)"]}} +{"task_id": "f_540", "prompt": "import pandas as pd\nfrom collections import Counter\n\ndef f_540(df):\n \"\"\"\n Calculate the frequency of combinations of elements in a DataFrame.\n The function adds a 'combination' column to the DataFrame, which is the combination of items in each row.\n It then calculates the frequency of each combination.\n \n Parameters:\n - df (pandas.DataFrame): The input DataFrame with columns 'item1', 'item2', 'item3', 'item4', 'item5'.\n \n Returns:\n - dict: A dictionary containing the frequency of all combination.\n\n Requirements:\n - pandas\n - collections\n\n Example:\n >>> df = pd.DataFrame({'item1': ['a', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})\n >>> f_540(df)\n {('a', 'b', 'c', 'd', 'e'): 2, ('b', 'c', 'd', 'e', 'f'): 1}\n \"\"\"", "canonical_solution": " df['combination'] = pd.Series(df.apply(lambda row: tuple(sorted(row)), axis=1))\n \n # Using Counter from collections to calculate the frequency of each combination\n combination_freq = Counter(df['combination'])\n \n return dict(combination_freq)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'item1': ['a', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 
2)\n self.assertEqual(freq[('b', 'c', 'd', 'e', 'f')], 1)\n def test_case_2(self):\n df = pd.DataFrame({'item1': ['c', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})\n freq = f_540(df)\n print(freq)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 1)\n self.assertEqual(freq[('b', 'c', 'd', 'e', 'f')], 1)\n if ('b', 'c', 'c', 'd', 'e') in freq:\n self.assertEqual(freq[('b', 'c', 'c', 'd', 'e')], 1)\n elif ('c', 'b', 'c', 'd', 'e') in freq:\n self.assertEqual(freq[('c', 'b', 'c', 'd', 'e')], 1)\n def test_case_3(self):\n df = pd.DataFrame({'item1': ['a'], 'item2': ['a'], 'item3': ['a'], 'item4': ['a'], 'item5': ['a']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'a', 'a', 'a', 'a')], 1)\n def test_case_4(self):\n df = pd.DataFrame({'item1': ['a', 'b', 'c'], 'item2': ['b', 'c', 'd'], 'item3': ['c', 'd', 'e'], 'item4': ['d', 'e', 'f'], 'item5': ['e', 'f', 'g']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 1)\n self.assertEqual(freq[('b', 'c', 'd', 'e', 'f')], 1)\n self.assertEqual(freq[('c', 'd', 'e', 'f', 'g')], 1)\n def test_case_5(self):\n df = pd.DataFrame({'item1': ['a', 'a', 'a'], 'item2': ['b', 'b', 'b'], 'item3': ['c', 'c', 'c'], 'item4': ['d', 'd', 'd'], 'item5': ['e', 'e', 'e']})\n freq = f_540(df)\n self.assertEqual(freq[('a', 'b', 'c', 'd', 'e')], 3)", "apis": ["collections.Counter", "pandas.Series"], "libs": ["collections", "pandas"], "doc": {"description": ["Calculate the frequency of combinations of elements in a DataFrame.", "The function adds a 'combination' column to the DataFrame, which is the combination of items in each row.", "It then calculates the frequency of each combination."], "note": [], "params": ["df (pandas.DataFrame): The input DataFrame with columns 'item1', 'item2', 'item3', 'item4', 'item5'."], "returns": ["dict: A dictionary containing the frequency of all combination."], "reqs": ["pandas", "collections"], "raises": [], "example": 
[">>> df = pd.DataFrame({'item1': ['a', 'b', 'a'], 'item2': ['b', 'c', 'b'], 'item3': ['c', 'd', 'c'], 'item4': ['d', 'e', 'd'], 'item5': ['e', 'f', 'e']})", ">>> f_540(df)", "{('a', 'b', 'c', 'd', 'e'): 2, ('b', 'c', 'd', 'e', 'f'): 1}"]}} +{"task_id": "f_905", "prompt": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy.stats import norm\n\n\ndef f_905(arr: np.ndarray) -> (plt.Axes, np.ndarray):\n \"\"\"\n Plots a histogram of normalized data from an input 2D numpy array alongside the probability density function (PDF)\n of a standard normal distribution.\n\n Note:\n - Takes in a 2D numpy array as input.\n - Calculates the sum of elements in each row of the array.\n - Normalizes these row sums to have a mean of 0 and a standard deviation of 1.\n - Normalization is achieved by first calculating the mean and standard deviation of the row sums.\n - Each row sum is then transformed by subtracting the mean and dividing by the standard deviation.\n - If the standard deviation is 0 (indicating all row sums are equal), normalization results in an array of zeros with the same shape.\n - Plots a histogram of the normalized data.\n - Uses 30 bins for the histogram.\n - The histogram is density-based, meaning it represents the probability density rather than raw frequencies.\n - The bars of the histogram are semi-transparent (60% opacity) and green in color.\n - Overlays the PDF of a standard normal distribution on the histogram for comparison.\n - The PDF curve is plotted in red with a line width of 2.\n - The range of the PDF curve is set to cover 99% of a standard normal distribution.\n - Sets the title of the plot to \"Histogram of Normalized Data with Standard Normal PDF\".\n\n Parameters:\n - arr: A 2D numpy array. 
The array should contain numerical data.\n\n Returns:\n - A tuple containing:\n - A matplotlib Axes object with the histogram of the normalized data and the overlaid standard normal PDF.\n - The normalized data as a 1D numpy array.\n\n Requirements:\n - numpy\n - scipy\n - matplotlib\n\n Example:\n >>> ax, normalized_data = f_905(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))\n >>> type(ax)\n \n >>> print(normalized_data)\n [-1.22474487 0. 1.22474487]\n \"\"\"", "canonical_solution": " # Calculating row sums\n row_sums = arr.sum(axis=1)\n\n # Normalizing the data\n mean = np.mean(row_sums)\n std_dev = np.std(row_sums)\n normalized_data = (\n (row_sums - mean) / std_dev if std_dev != 0 else np.zeros_like(row_sums)\n )\n\n # Plotting the histogram\n _, ax = plt.subplots()\n ax.hist(normalized_data, bins=30, density=True, alpha=0.6, color=\"g\")\n\n # Plotting the PDF of a standard normal distribution\n x = np.linspace(norm.ppf(0.01), norm.ppf(0.99), 100)\n ax.plot(x, norm.pdf(x), \"r-\", lw=2)\n ax.set_title(\"Histogram of Normalized Data with Standard Normal PDF\")\n\n return ax, normalized_data", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for `f_905`.\"\"\"\n def test_histogram_and_pdf(self):\n \"\"\"Test that the histogram and PDF are plotted.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n ax, _ = f_905(arr)\n self.assertEqual(\n ax.get_title(),\n \"Histogram of Normalized Data with Standard Normal PDF\",\n )\n self.assertEqual(len(ax.lines), 1)\n self.assertEqual(len(ax.patches), 30)\n def test_normalized_data(self):\n \"\"\"Test that the normalized data is correct.\"\"\"\n arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n _, normalized_data = f_905(arr)\n expected_data = [-1.22474487, 0.0, 1.22474487]\n for i in range(len(expected_data)):\n self.assertTrue(np.isclose(normalized_data[i], expected_data[i]))\n def test_empty_array(self):\n \"\"\"Test empty array.\"\"\"\n arr = 
np.array([[], [], []])\n _, normalized_data = f_905(arr)\n for value in normalized_data:\n self.assertTrue(np.isclose(value, 0))\n def test_single_value_array(self):\n \"\"\"Test single value array.\"\"\"\n arr = np.array([[5], [5], [5]])\n _, normalized_data = f_905(arr)\n for value in normalized_data:\n self.assertTrue(np.isclose(value, 0))\n def test_large_values(self):\n \"\"\"Test large values.\"\"\"\n arr = np.array([[1e6, 2e6, 3e6], [4e6, 5e6, 6e6], [7e6, 8e6, 9e6]])\n _, normalized_data = f_905(arr)\n expected_data = [-1.22474487, 0.0, 1.22474487]\n for i in range(len(expected_data)):\n self.assertTrue(np.isclose(normalized_data[i], expected_data[i]))", "apis": ["numpy.zeros_like", "numpy.mean", "scipy.stats.norm.pdf", "scipy.stats.norm.ppf", "numpy.ndarray", "numpy.linspace", "matplotlib.pyplot.subplots", "numpy.std", "matplotlib.pyplot.Axes"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Plots a histogram of normalized data from an input 2D numpy array alongside the probability density function (PDF)", "of a standard normal distribution."], "note": ["Takes in a 2D numpy array as input.", "Calculates the sum of elements in each row of the array.", "Normalizes these row sums to have a mean of 0 and a standard deviation of 1.", "Normalization is achieved by first calculating the mean and standard deviation of the row sums.", "Each row sum is then transformed by subtracting the mean and dividing by the standard deviation.", "If the standard deviation is 0 (indicating all row sums are equal), normalization results in an array of zeros with the same shape.", "Plots a histogram of the normalized data.", "Uses 30 bins for the histogram.", "The histogram is density-based, meaning it represents the probability density rather than raw frequencies.", "The bars of the histogram are semi-transparent (60% opacity) and green in color.", "Overlays the PDF of a standard normal distribution on the histogram for comparison.", "The PDF curve is plotted 
in red with a line width of 2.", "The range of the PDF curve is set to cover 99% of a standard normal distribution.", "Sets the title of the plot to \"Histogram of Normalized Data with Standard Normal PDF\"."], "params": ["arr: A 2D numpy array. The array should contain numerical data."], "returns": ["A tuple containing:", "A matplotlib Axes object with the histogram of the normalized data and the overlaid standard normal PDF.", "The normalized data as a 1D numpy array."], "reqs": ["numpy", "scipy", "matplotlib"], "raises": [], "example": [">>> ax, normalized_data = f_905(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))", ">>> type(ax)", "", ">>> print(normalized_data)", "[-1.22474487 0. 1.22474487]"]}} +{"task_id": "f_345", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\n\n\ndef f_345(P, T):\n \"\"\"\n Calculate the product of matrix \"P\" and 3D tensor \"T\" then return dataframe of normalized results.\n\n This function performs matrix-tensor multiplication between a matrix \"P\" and a 3D tensor \"T\" using numpy.\n It checks if the shapes of P and T are compatible for multiplication, raising a ValueError if they are not.\n The function then normalizes the resulting 2D array using sklearn's StandardScaler. The final output\n is returned as a pandas DataFrame, with columns named feature_0, feature_1, ..., feature_n,\n where n is the number of features in the flattened result of the matrix-tensor multiplication.\n\n Parameters:\n - P (numpy.ndarray): The input matrix. Must not be empty.\n - T (numpy.ndarray): The input tensor. 
Must not be empty.\n\n Returns:\n pandas.DataFrame: A DataFrame with the normalized result.\n\n Requirements:\n - numpy\n - pandas\n - sklearn.preprocessing\n\n Example:\n >>> np.random.seed(0)\n >>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])\n >>> T = np.random.rand(3, 5, 5)\n >>> result = f_345(P, T)\n >>> type(result)\n \n >>> result.head(2)\n feature_0 feature_1 feature_2 ... feature_22 feature_23 feature_24\n 0 0.214791 0.220904 1.697850 ... 1.768847 -1.759510 -0.003527\n 1 -0.652336 1.064228 -0.707134 ... -0.036116 1.002544 -0.813796\n \n [2 rows x 25 columns]\n \"\"\"", "canonical_solution": " if P.size == 0 or T.size == 0:\n raise ValueError(\"Inputs cannot be empty.\")\n if P.shape[1] != T.shape[0]:\n raise ValueError(\n f\"Matrix P shape {P.shape[1]} and Tensor T shape {T.shape[0]} are incompatible for tensor multiplication.\"\n )\n\n result = np.tensordot(P, T, axes=[1, 0]).swapaxes(0, 1)\n result = result.reshape(result.shape[0], -1)\n\n scaler = StandardScaler()\n result = scaler.fit_transform(result)\n\n adjusted_feature_names = [f\"feature_{i}\" for i in range(result.shape[1])]\n result = pd.DataFrame(result, columns=adjusted_feature_names)\n\n return result", "test": "import unittest\nimport numpy as np\nfrom sklearn.preprocessing import StandardScaler\nclass TestCases(unittest.TestCase):\n def tensor_product_manual(self, P, T):\n \"\"\"Manually compute the tensor product without any normalization.\"\"\"\n result = np.tensordot(P, T, axes=[1, 0]).swapaxes(0, 1)\n result = result.reshape(result.shape[0], -1)\n return result\n def test_case_1(self):\n np.random.seed(0)\n P = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n T = np.random.rand(3, 4, 4)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (4, 12))\n 
self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_2(self):\n np.random.seed(0)\n P = np.array([[1, 2], [3, 4], [5, 6]])\n T = np.random.rand(3, 5, 5)\n with self.assertRaises(ValueError):\n f_345(P, T)\n def test_case_3(self):\n np.random.seed(0)\n P = np.eye(4)\n T = np.random.rand(4, 6, 6)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (6, 24))\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_4(self):\n np.random.seed(0)\n P = np.ones((5, 5))\n T = np.random.rand(5, 7, 7)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (7, 35))\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_5(self):\n np.random.seed(0)\n P = np.diag(np.arange(1, 7))\n T = np.random.rand(6, 8, 8)\n result = f_345(P, T)\n manual_result = self.tensor_product_manual(P, T)\n # Reverse normalization for comparison\n scaler = StandardScaler().fit(manual_result)\n reversed_result = scaler.inverse_transform(result)\n self.assertEqual(result.shape, (8, 48))\n self.assertTrue(np.isclose(result.mean().mean(), 0, atol=1e-5))\n self.assertTrue(np.allclose(manual_result, reversed_result, atol=1e-5))\n def test_case_6(self):\n # Test with an empty matrix and tensor, expecting a ValueError due to incompatible shapes\n P = np.array([])\n T = np.array([])\n with self.assertRaises(ValueError):\n f_345(P, T)\n 
def test_case_7(self):\n # Test with non-numeric inputs in matrices/tensors to verify type handling\n P = np.array([[\"a\", \"b\"], [\"c\", \"d\"]])\n T = np.random.rand(2, 2, 2)\n with self.assertRaises(Exception):\n f_345(P, T)\n def test_case_8(self):\n # Test with zero matrix and tensor to verify handling of all-zero inputs\n P = np.zeros((5, 5))\n T = np.zeros((5, 3, 3))\n result = f_345(P, T)\n self.assertTrue(np.allclose(result, np.zeros((3, 15))))\n def test_case_9(self):\n # Test DataFrame output for correct column names, ensuring they match expected feature naming convention\n P = np.random.rand(3, 3)\n T = np.random.rand(3, 4, 4)\n result = f_345(P, T)\n expected_columns = [\n \"feature_0\",\n \"feature_1\",\n \"feature_2\",\n \"feature_3\",\n \"feature_4\",\n \"feature_5\",\n \"feature_6\",\n \"feature_7\",\n \"feature_8\",\n \"feature_9\",\n \"feature_10\",\n \"feature_11\",\n ]\n self.assertListEqual(list(result.columns), expected_columns)\n def test_case_10(self):\n # Test to ensure DataFrame indices start from 0 and are sequential integers\n P = np.random.rand(2, 3)\n T = np.random.rand(3, 5, 5)\n result = f_345(P, T)\n expected_indices = list(range(5)) # Expected indices for 5 rows\n self.assertListEqual(list(result.index), expected_indices)", "apis": ["pandas.DataFrame", "sklearn.preprocessing.StandardScaler", "numpy.tensordot"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Calculate the product of matrix \"P\" and 3D tensor \"T\" then return dataframe of normalized results.", "This function performs matrix-tensor multiplication between a matrix \"P\" and a 3D tensor \"T\" using numpy.", "It checks if the shapes of P and T are compatible for multiplication, raising a ValueError if they are not.", "The function then normalizes the resulting 2D array using sklearn's StandardScaler. 
The final output", "is returned as a pandas DataFrame, with columns named feature_0, feature_1, ..., feature_n,", "where n is the number of features in the flattened result of the matrix-tensor multiplication."], "note": [], "params": ["P (numpy.ndarray): The input matrix. Must not be empty.", "T (numpy.ndarray): The input tensor. Must not be empty."], "returns": ["pandas.DataFrame: A DataFrame with the normalized result."], "reqs": ["numpy", "pandas", "sklearn.preprocessing"], "raises": [], "example": [">>> np.random.seed(0)", ">>> P = np.array([[6, 2, 7], [1, 1, 8], [8, 7, 1], [9, 6, 4], [2, 1, 1]])", ">>> T = np.random.rand(3, 5, 5)", ">>> result = f_345(P, T)", ">>> type(result)", "", ">>> result.head(2)", "feature_0 feature_1 feature_2 ... feature_22 feature_23 feature_24", "0 0.214791 0.220904 1.697850 ... 1.768847 -1.759510 -0.003527", "1 -0.652336 1.064228 -0.707134 ... -0.036116 1.002544 -0.813796", "", "[2 rows x 25 columns]"]}} +{"task_id": "f_850", "prompt": "import requests\nimport pandas as pd\n\n\ndef f_850(url: str) -> pd.DataFrame:\n \"\"\"\n This function fetches JSON data from a specified URL and converts it into a Pandas DataFrame.\n It expects the JSON to be in a format that is directly convertible to a DataFrame, typically\n a list of dictionaries. The function handles various scenarios including successful data\n retrieval and conversion, network issues, and invalid JSON format.\n\n Parameters:\n - url (str): The URL where the JSON file is located.\n\n Returns:\n - pd.DataFrame: A DataFrame constructed from the JSON data fetched from the URL.\n\n Raises:\n - SystemError: If there is a network-related issue such as a connection error, timeout,\n or if the server responded with an unsuccessful status code (like 404 or 500). This is a\n re-raised exception from requests.RequestException to provide a more specific error message.\n - ValueError: If the fetched data is not in a valid JSON format that can be converted into\n a DataFrame. 
This could occur if the data structure does not match the expected format (e.g.,\n not a list of dictionaries).\n\n Requirements:\n - requests\n - pandas\n\n Example:\n >>> f_850('https://example.com/data.json')\n DataFrame:\n A B\n\n Notes:\n - The function uses a timeout of 5 seconds for the network request to avoid hanging indefinitely.\n - It checks the HTTP response status and raises an HTTPError for unsuccessful status codes.\n - Directly converts the HTTP response to JSON and then to a DataFrame, without intermediate processing.\n \"\"\"", "canonical_solution": " try:\n response = requests.get(url, timeout=5)\n response.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code\n data = response.json() # Directly converts the response content to JSON\n df = pd.DataFrame(data)\n return df\n except requests.RequestException as e:\n raise SystemError(f\"Network error occurred: {e}\") from e\n except ValueError as exc:\n raise ValueError(\"Invalid JSON format for DataFrame conversion\") from exc", "test": "import unittest\nimport requests\nimport pandas as pd\nfrom unittest.mock import patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_850.\"\"\"\n @patch(\"requests.get\")\n def test_valid_json(self, mock_get):\n \"\"\"Test a valid JSON.\"\"\"\n mock_get.return_value.json.return_value = [{\"A\": 1, \"B\": 3}, {\"A\": 2, \"B\": 4}]\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/data.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertListEqual(df.columns.tolist(), [\"A\", \"B\"])\n self.assertListEqual(df[\"A\"].tolist(), [1, 2])\n self.assertListEqual(df[\"B\"].tolist(), [3, 4])\n @patch(\"requests.get\")\n def test_empty_json(self, mock_get):\n \"\"\"Test an empty JSON.\"\"\"\n mock_get.return_value.json.return_value = []\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/empty.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n 
self.assertEqual(len(df), 0)\n @patch(\"requests.get\")\n def test_invalid_json(self, mock_get):\n \"\"\"Test an invalid JSON.\"\"\"\n mock_get.return_value.json.side_effect = ValueError()\n with self.assertRaises(ValueError):\n f_850(\"https://example.com/invalid.json\")\n @patch(\"requests.get\")\n def test_large_json(self, mock_get):\n \"\"\"Test a large JSON.\"\"\"\n mock_get.return_value.json.return_value = [{\"X\": i} for i in range(1000)]\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/large.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertListEqual(df[\"X\"].tolist(), list(range(1000)))\n @patch(\"requests.get\")\n def test_null_json(self, mock_get):\n \"\"\"Test a JSON that is null.\"\"\"\n mock_get.return_value.json.return_value = None\n mock_get.return_value.status_code = 200\n df = f_850(\"https://example.com/null.json\")\n self.assertTrue(isinstance(df, pd.DataFrame))\n self.assertEqual(len(df), 0)\n @patch(\"requests.get\")\n def test_system_error(self, mock_get):\n \"\"\"Test a general error.\"\"\"\n mock_get.side_effect = requests.RequestException\n with self.assertRaises(SystemError):\n f_850(\"https://example.com/data.json\")", "apis": ["pandas.DataFrame", "requests.RequestException", "requests.get"], "libs": ["pandas", "requests"], "doc": {"description": ["This function fetches JSON data from a specified URL and converts it into a Pandas DataFrame.", "It expects the JSON to be in a format that is directly convertible to a DataFrame, typically", "a list of dictionaries. 
The function handles various scenarios including successful data", "retrieval and conversion, network issues, and invalid JSON format.", "Notes:", "- The function uses a timeout of 5 seconds for the network request to avoid hanging indefinitely.", "- It checks the HTTP response status and raises an HTTPError for unsuccessful status codes.", "- Directly converts the HTTP response to JSON and then to a DataFrame, without intermediate processing."], "note": [], "params": ["url (str): The URL where the JSON file is located."], "returns": ["pd.DataFrame: A DataFrame constructed from the JSON data fetched from the URL."], "reqs": ["requests", "pandas"], "raises": ["SystemError: If there is a network-related issue such as a connection error, timeout,", "or if the server responded with an unsuccessful status code (like 404 or 500). This is a", "re-raised exception from requests.RequestException to provide a more specific error message.", "ValueError: If the fetched data is not in a valid JSON format that can be converted into", "a DataFrame. This could occur if the data structure does not match the expected format (e.g.,", "not a list of dictionaries)."], "example": [">>> f_850('https://example.com/data.json')", "DataFrame:", "A B"]}} +{"task_id": "f_915", "prompt": "import matplotlib.pyplot as plt\nfrom itertools import cycle\nimport numpy as np\nfrom random import shuffle\n\nCOLORS = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\"]\n\n\ndef f_915(list_of_lists):\n \"\"\"\n Plots a series of lines for each list in `list_of_lists`. Each line is plotted with shuffled y-values\n and sequential x-values starting from 1. The function shuffles the y-values of each inner list before plotting.\n Each line is plotted with a different color from a predetermined set of colors. 
The function cycles through \n these colors for each inner list.\n\n Parameters:\n - list_of_lists (list of list): A list of lists where each inner\n list represents a set of y-values to be shuffled and plotted. The x-values are automatically\n generated as a sequence starting from 1 up to the length of the inner list.\n\n Returns:\n - tuple: A tuple containing the figure and axes objects of the plotted graph.\n\n Requirements:\n - matplotlib\n - itertools\n - numpy\n - random\n\n Example:\n >>> import random\n >>> random.seed(0)\n >>> fig, ax = f_915([[1, 2, 3], [4, 5, 6]])\n >>> ax.lines[0].get_color()\n (0.0, 0.0, 1.0, 1)\n\n Note:\n - If an inner list is empty, it will be skipped and no line will be plotted for it.\n - The colors are reused cyclically if there are more inner lists than colors available.\n - The shuffling of y-values is random and different each time the function is called,\n unless a random seed is set externally.\n - The function uses a default set of colors defined in the COLORS constant.\n \"\"\"", "canonical_solution": " fig, ax = plt.subplots()\n color_cycle = cycle(COLORS)\n\n for list_ in list_of_lists:\n y_values = np.arange(1, len(list_) + 1)\n shuffle(y_values)\n ax.plot(y_values, next(color_cycle))\n\n return fig, ax", "test": "import unittest\nfrom matplotlib.figure import Figure\nfrom matplotlib.axes import Axes\nimport matplotlib.colors as mcolors\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_915.\"\"\"\n def test_return_types(self):\n \"\"\"Check that the function returns the correct types.\"\"\"\n random.seed(0)\n fig, ax = f_915([[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]])\n self.assertIsInstance(\n fig,\n Figure,\n \"The first return value should be an instance of matplotlib.figure.Figure.\",\n )\n self.assertIsInstance(\n ax,\n Axes,\n \"The second return value should be an instance of matplotlib.axes._subplots.Axes.\",\n )\n def test_number_of_lines(self):\n \"\"\"Check that the 
correct number of lines are plotted.\"\"\"\n random.seed(1)\n _, ax = f_915([[\"x\", \"y\", \"z\"], [\"a\", \"b\", \"c\"]])\n self.assertEqual(\n len(ax.lines), 2, \"There should be 2 lines plotted for 2 lists.\"\n )\n _, ax = f_915([[\"x\", \"y\", \"z\"]])\n self.assertEqual(len(ax.lines), 1, \"There should be 1 line plotted for 1 list.\")\n def test_color_cycle(self):\n \"\"\"Check that the colors of the plotted lines follow the specified cycle.\"\"\"\n random.seed(2)\n _, ax = f_915([[\"x\"], [\"y\"], [\"z\"], [\"a\"], [\"b\"], [\"c\"], [\"d\"], [\"e\"]])\n expected_colors = [\"b\", \"g\", \"r\", \"c\", \"m\", \"y\", \"k\", \"b\"]\n # Convert color codes to RGBA format\n expected_colors_rgba = [mcolors.to_rgba(c) for c in expected_colors]\n actual_colors_rgba = [line.get_color() for line in ax.lines]\n self.assertEqual(\n actual_colors_rgba,\n expected_colors_rgba,\n \"The colors of the plotted lines should follow the specified cycle.\",\n )\n def test_y_values(self):\n \"\"\"Check that the y-values are shuffled.\"\"\"\n random.seed(3)\n _, ax = f_915([[\"x\", \"y\", \"z\"]])\n y_data = ax.lines[0].get_ydata()\n self.assertTrue(\n set(y_data) == {1, 2, 3},\n \"The y-values should be shuffled numbers from the range [1, len(list)].\",\n )\n def test_empty_input(self):\n \"\"\"Check that no lines are plotted for an empty input list.\"\"\"\n random.seed(4)\n _, ax = f_915([])\n self.assertEqual(\n len(ax.lines),\n 0,\n \"There should be no lines plotted for an empty input list.\",\n )", "apis": ["matplotlib.pyplot.subplots", "numpy.arange", "random.shuffle", "itertools.cycle"], "libs": ["itertools", "random", "numpy", "matplotlib"], "doc": {"description": ["Plots a series of lines for each list in `list_of_lists`. Each line is plotted with shuffled y-values", "and sequential x-values starting from 1. The function shuffles the y-values of each inner list before plotting.", "Each line is plotted with a different color from a predetermined set of colors. 
The function cycles through", "these colors for each inner list."], "note": ["If an inner list is empty, it will be skipped and no line will be plotted for it.", "The colors are reused cyclically if there are more inner lists than colors available.", "The shuffling of y-values is random and different each time the function is called,", "unless a random seed is set externally.", "The function uses a default set of colors defined in the COLORS constant."], "params": ["list_of_lists (list of list): A list of lists where each inner", "list represents a set of y-values to be shuffled and plotted. The x-values are automatically", "generated as a sequence starting from 1 up to the length of the inner list."], "returns": ["tuple: A tuple containing the figure and axes objects of the plotted graph."], "reqs": ["matplotlib", "itertools", "numpy", "random"], "raises": [], "example": [">>> import random", ">>> random.seed(0)", ">>> fig, ax = f_915([[1, 2, 3], [4, 5, 6]])", ">>> ax.lines[0].get_color()", "(0.0, 0.0, 1.0, 1)"]}} +{"task_id": "f_810", "prompt": "import numpy as np\nfrom scipy import integrate\nimport matplotlib.pyplot as plt\n\n\ndef f_810(func, x_range=(-2, 2), num_points=1000):\n \"\"\"\n Calculates and plots both a given function and its cumulative integral over a specified range,\n using a linearly spaced range of x-values.\n\n Parameters:\n func (function): A function of a single variable to integrate and plot.\n x_range (tuple, optional): The range (start, end) over which to evaluate `func`. Defaults to (-2, 2).\n num_points (int, optional): Number of points to generate in `x_range`. 
Defaults to 1000.\n\n Returns:\n matplotlib.axes.Axes: The Axes object containing the plots of the function and its integral.\n\n Requirements:\n - numpy\n - scipy\n - matplotlib\n\n Note:\n - The plot includes a legend and labels for the x and y axes that include the function's name.\n\n Example:\n >>> ax = f_810(np.sin)\n >>> type(ax)\n \n >>> ax.get_legend_handles_labels()[-1]\n ['sin(x)', 'Integral of sin(x)']\n \"\"\"", "canonical_solution": " X = np.linspace(x_range[0], x_range[1], num_points)\n y = func(X)\n y_int = integrate.cumulative_trapezoid(y, X, initial=0)\n\n fig, ax = plt.subplots()\n ax.plot(X, y, label=f\"{func.__name__}(x)\")\n ax.plot(X, y_int, label=f\"Integral of {func.__name__}(x)\")\n ax.legend()\n\n return ax", "test": "import unittest\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n def tearDown(self):\n plt.close(\"all\")\n def helper_assert_plot_attributes(self, func):\n # Test plot attributes are as expected\n ax = f_810(func)\n function_name = func.__name__\n legend_labels = ax.get_legend_handles_labels()[-1]\n self.assertIsInstance(ax, Axes)\n self.assertIn(function_name, legend_labels[0])\n self.assertIn(function_name, legend_labels[1])\n def test_case_1(self):\n # Test basic case in docstring\n ax = f_810(np.sin)\n self.helper_assert_plot_attributes(np.sin)\n def test_case_2(self):\n # Test other functions - numpy\n for func in [np.cos, np.exp]:\n ax = f_810(func)\n self.helper_assert_plot_attributes(func)\n def test_case_3(self):\n # Test other functions - lambda\n func = lambda x: x ** 2\n ax = f_810(func)\n self.helper_assert_plot_attributes(func)\n def test_case_4(self):\n # Test custom range and points\n ax = f_810(np.cos, x_range=(0, np.pi), num_points=500)\n self.assertEqual(len(ax.lines[0].get_xdata()), 500)\n self.assertEqual(ax.lines[0].get_xdata()[0], 0)\n self.assertEqual(ax.lines[0].get_xdata()[-1], np.pi)\n def test_case_5(self):\n # Test 
correct integral calculation\n # Test integral of x^2 in the range [0,1], should be close to 1/3\n func = lambda x: x ** 2\n X = np.linspace(0, 1, 1000)\n expected_integral = 1 / 3 * X ** 3 # Analytical integral of x^2\n ax = f_810(func, x_range=(0, 1), num_points=1000)\n computed_integral = ax.lines[1].get_ydata()[\n -1\n ] # Last value of the computed integral\n self.assertAlmostEqual(computed_integral, expected_integral[-1], places=4)", "apis": ["matplotlib.pyplot.subplots", "numpy.linspace", "scipy.integrate.cumulative_trapezoid"], "libs": ["numpy", "matplotlib", "scipy"], "doc": {"description": ["Calculates and plots both a given function and its cumulative integral over a specified range,", "using a linearly spaced range of x-values."], "note": ["The plot includes a legend and labels for the x and y axes that include the function's name."], "params": ["func (function): A function of a single variable to integrate and plot.", "x_range (tuple, optional): The range (start, end) over which to evaluate `func`. Defaults to (-2, 2).", "num_points (int, optional): Number of points to generate in `x_range`. 
Defaults to 1000."], "returns": ["matplotlib.axes.Axes: The Axes object containing the plots of the function and its integral."], "reqs": ["numpy", "scipy", "matplotlib"], "raises": [], "example": [">>> ax = f_810(np.sin)", ">>> type(ax)", "", ">>> ax.get_legend_handles_labels()[-1]", "['sin(x)', 'Integral of sin(x)']"]}} +{"task_id": "f_404", "prompt": "import pandas as pd\nimport numpy as np\nimport statsmodels.api as sm\n\n\ndef f_404(\n array: list, random_seed: int = 0\n) -> (pd.DataFrame, sm.regression.linear_model.RegressionResultsWrapper):\n \"\"\"\n Generate a Pandas DataFrame from a 2D list and perform a multiple linear regression.\n\n The function first validates the input list, creates a DataFrame, separates independent and dependent variables,\n adds a constant to the model, and fits a linear regression using statsmodels.\n\n Parameters:\n - array (list of list of int): A 2D list where each sub-list represents a row of data.\n Each sub-list should have exactly 5 elements, where the first 4 elements are\n treated as independent variables ('A', 'B', 'C', 'D') and the last element is\n the dependent (Response) variable.\n\n - random_seed (int): A seed for reproducibility in numpy for statsmodels. 
Defaults to 0.\n\n Returns:\n - df (pd.DataFrame): DataFrame with columns 'A', 'B', 'C', 'D', 'Response'.\n - results (statsmodels.RegressionResults): Results of the linear regression.\n\n Requirements:\n - pandas\n - numpy\n - statsmodels.api.sm\n\n Example:\n >>> df, results = f_404([[1,2,3,4,5], [6,7,8,9,10]])\n >>> print(df)\n A B C D Response\n 0 1 2 3 4 5\n 1 6 7 8 9 10\n \"\"\"", "canonical_solution": " COLUMNS = [\"A\", \"B\", \"C\", \"D\", \"Response\"]\n\n np.random.seed(random_seed)\n\n if not all(len(row) == len(COLUMNS) for row in array):\n raise ValueError(\n \"Each sub-list in the input 2D list must have exactly 5 elements.\"\n )\n\n df = pd.DataFrame(array, columns=COLUMNS)\n X = df[COLUMNS[:-1]]\n y = df[\"Response\"]\n X = sm.add_constant(X)\n\n model = sm.OLS(y, X)\n results = model.fit()\n\n return df, results", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing dataframe creation, model accuracy, and parameters with various numeric data types\n test_data = [\n ([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]], 42, 1.0), # Positive values\n ([[-1, -2, -3, -4, -5], [-6, -7, -8, -9, -10]], 42, 1.0), # Negative values\n (\n [[100, 200, 300, 400, 500], [600, 700, 800, 900, 1000]],\n 42,\n 1.0,\n ), # Large values\n ]\n for array, random_seed, expected_r2 in test_data:\n with self.subTest(array=array):\n df, results = f_404(array, random_seed=random_seed)\n expected_df = pd.DataFrame(\n array, columns=[\"A\", \"B\", \"C\", \"D\", \"Response\"]\n )\n self.assertTrue(df.equals(expected_df))\n self.assertAlmostEqual(results.rsquared, expected_r2, places=2)\n for param in results.params:\n self.assertNotEqual(param, 0)\n def test_case_2(self):\n # Testing with more rows in the 2D list to ensure model scalability and consistency\n random_seed = 42\n array = [\n [1, 2, 3, 4, 5],\n [6, 7, 8, 9, 10],\n [11, 12, 13, 14, 15],\n [16, 17, 18, 19, 20],\n ]\n df, results = f_404(array, 
random_seed=random_seed)\n expected_df = pd.DataFrame(array, columns=[\"A\", \"B\", \"C\", \"D\", \"Response\"])\n self.assertTrue(df.equals(expected_df))\n self.assertAlmostEqual(results.rsquared, 1.0, places=2)\n for param in results.params:\n self.assertNotEqual(param, 0)\n def test_case_3(self):\n # Testing input validation for incorrect number of columns in a row\n array = [[1, 2, 3, 4], [5, 6, 7, 8]] # Missing dependent variable\n with self.assertRaises(ValueError):\n f_404(array)\n def test_case_4(self):\n # Testing handling of non-numeric values to ensure type safety\n array = [[\"a\", \"b\", \"c\", \"d\", \"e\"]] # All elements as strings\n with self.assertRaises(ValueError):\n df, results = f_404(array)\n # This assumes the function is modified to catch and raise ValueError for non-numeric inputs\n def test_case_5(self):\n # Testing reproducibility by using the same random_seed\n array = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]\n random_seed = 123\n df1, results1 = f_404(array, random_seed=random_seed)\n df2, results2 = f_404(array, random_seed=random_seed)\n self.assertTrue(df1.equals(df2))\n self.assertEqual(results1.params.tolist(), results2.params.tolist())\n def test_case_6(self):\n # Testing with an empty array to check function's handling of no input data\n array = []\n with self.assertRaises(ValueError):\n f_404(array)", "apis": ["pandas.DataFrame", "numpy.random", "statsmodels.api.OLS", "statsmodels.api.add_constant", "statsmodels.api.regression", "numpy.random.seed"], "libs": ["statsmodels", "numpy", "pandas"], "doc": {"description": ["Generate a Pandas DataFrame from a 2D list and perform a multiple linear regression.", "The function first validates the input list, creates a DataFrame, separates independent and dependent variables,", "adds a constant to the model, and fits a linear regression using statsmodels.", "- random_seed (int): A seed for reproducibility in numpy for statsmodels. 
Defaults to 0."], "note": [], "params": ["array (list of list of int): A 2D list where each sub-list represents a row of data.", "Each sub-list should have exactly 5 elements, where the first 4 elements are", "treated as independent variables ('A', 'B', 'C', 'D') and the last element is", "the dependent (Response) variable."], "returns": ["df (pd.DataFrame): DataFrame with columns 'A', 'B', 'C', 'D', 'Response'.", "results (statsmodels.RegressionResults): Results of the linear regression."], "reqs": ["pandas", "numpy", "statsmodels.api.sm"], "raises": [], "example": [">>> df, results = f_404([[1,2,3,4,5], [6,7,8,9,10]])", ">>> print(df)", "A B C D Response", "0 1 2 3 4 5", "1 6 7 8 9 10"]}} +{"task_id": "f_370", "prompt": "from collections import Counter\nimport pandas as pd\n\n\ndef f_370(myList):\n \"\"\"\n Count the frequency of each word in a list and return a DataFrame of words and their number.\n\n Parameters:\n myList (list): List of strings. Each string is considered a word regardless of its content,\n however the function is case insensitive, and it removes\n leading and trailing whitespaces. 
If empty, function returns\n a DataFrame with a Count column that is otherwise empty.\n\n Returns:\n DataFrame: A pandas DataFrame with words and their counts.\n\n Requirements:\n - collections.Counter\n - pandas\n\n Example:\n >>> myList = ['apple', 'banana', 'apple', 'cherry', 'banana', 'banana']\n >>> f_370(myList)\n Count\n apple 2\n banana 3\n cherry 1\n \"\"\"", "canonical_solution": " words = [w.lower().strip() for w in myList]\n word_counts = dict(Counter(words))\n report_df = pd.DataFrame.from_dict(word_counts, orient=\"index\", columns=[\"Count\"])\n\n return report_df", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n input_data = [\"apple\", \"banana\", \"apple\", \"cherry\", \"banana\", \"banana\"]\n expected_output = pd.DataFrame(\n {\"Count\": [2, 3, 1]}, index=[\"apple\", \"banana\", \"cherry\"]\n )\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_2(self):\n # Test repeated value\n input_data = [\"apple\", \"apple\", \"apple\"]\n expected_output = pd.DataFrame({\"Count\": [3]}, index=[\"apple\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_3(self):\n # Test empty list\n input_data = []\n expected_output = pd.DataFrame(columns=[\"Count\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_4(self):\n # Test single entry\n input_data = [\"kiwi\"]\n expected_output = pd.DataFrame({\"Count\": [1]}, index=[\"kiwi\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_5(self):\n # Tests the function's ability to handle mixed case words correctly.\n input_data = [\"Apple\", \"apple\", \"APPLE\"]\n expected_output = pd.DataFrame({\"Count\": [3]}, index=[\"apple\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_6(self):\n # Tests the function's ability to handle words with leading/trailing spaces.\n 
input_data = [\"banana \", \" banana\", \" banana\"]\n expected_output = pd.DataFrame({\"Count\": [3]}, index=[\"banana\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_7(self):\n # Tests the function's ability to handle words with special characters.\n input_data = [\"kiwi!\", \"!kiwi\", \"kiwi\"]\n expected_output = pd.DataFrame(\n {\"Count\": [1, 1, 1]}, index=[\"kiwi!\", \"!kiwi\", \"kiwi\"]\n )\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_8(self):\n # Tests the function's handling of numeric strings as words.\n input_data = [\"123\", \"456\", \"123\", \"456\", \"789\"]\n expected_output = pd.DataFrame(\n {\"Count\": [2, 2, 1]}, index=[\"123\", \"456\", \"789\"]\n )\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_9(self):\n # Tests the function's handling of empty strings and strings with only spaces.\n input_data = [\" \", \" \", \"\", \"apple\", \"apple \"]\n expected_output = pd.DataFrame({\"Count\": [3, 2]}, index=[\"\", \"apple\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)\n def test_case_10(self):\n # Tests handling of strings that become duplicates after strip() is applied.\n input_data = [\"banana\", \"banana \", \" banana\", \"banana\"]\n expected_output = pd.DataFrame({\"Count\": [4]}, index=[\"banana\"])\n pd.testing.assert_frame_equal(f_370(input_data), expected_output)", "apis": ["pandas.DataFrame", "collections.Counter", "pandas.DataFrame.from_dict"], "libs": ["collections", "pandas"], "doc": {"description": ["Count the frequency of each word in a list and return a DataFrame of words and their number."], "note": [], "params": ["myList (list): List of strings. Each string is considered a word regardless of its content,", "however the function is case insensitive, and it removes", "leading and trailing whitespaces. 
If empty, function returns", "a DataFrame with a Count column that is otherwise empty."], "returns": ["DataFrame: A pandas DataFrame with words and their counts."], "reqs": ["collections.Counter", "pandas"], "raises": [], "example": [">>> myList = ['apple', 'banana', 'apple', 'cherry', 'banana', 'banana']", ">>> f_370(myList)", "Count", "apple 2", "banana 3", "cherry 1"]}} +{"task_id": "f_826", "prompt": "import pandas as pd\nimport seaborn as sns\nimport numpy as np\n\n\ndef f_826(df):\n \"\"\"\n Generates a pair plot from a numeric DataFrame and calculates its covariance matrix.\n\n Parameters:\n - df (pd.DataFrame): A pandas DataFrame with only numeric columns.\n\n Returns:\n - tuple:\n - covariance_df (pd.DataFrame): The covariance matrix of the input DataFrame.\n - pair_plot (sns.axisgrid.PairGrid): Pair plot of the input DataFrame.\n\n Raises:\n - ValueError: If the DataFrame is empty.\n - TypeError: If the DataFrame contains non-numeric data types.\n\n Requirements:\n - pandas\n - numpy\n - seaborn\n\n Examples:\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n >>> covariance_df, ax = f_826(df)\n >>> type(ax)\n \n >>> covariance_df\n A B C\n A 1.0 1.0 1.0\n B 1.0 1.0 1.0\n C 1.0 1.0 1.0\n \"\"\"", "canonical_solution": " if df.empty:\n raise ValueError(\"DataFrame is empty. Non-empty DataFrame required.\")\n if not all(df.dtypes.apply(lambda x: np.issubdtype(x, np.number))):\n raise TypeError(\n \"DataFrame contains non-numeric data. 
Only numeric data types are supported.\"\n )\n covariance_df = df.cov()\n pair_plot = sns.pairplot(df)\n\n return covariance_df, pair_plot", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def test_covariance_one(self):\n \"\"\"Test basic case with expected covariance of 1.0\"\"\"\n df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6], \"C\": [7, 8, 9]})\n covariance_df, _ = f_826(df)\n self.assertTrue((covariance_df == 1).all().all())\n def test_identical_values_dataframe(self):\n \"\"\"Test DataFrame where all rows have identical values.\"\"\"\n df = pd.DataFrame({\"A\": [1, 1, 1], \"B\": [2, 2, 2]})\n covariance_df, _ = f_826(df)\n self.assertTrue((covariance_df == 0).all().all())\n def test_with_empty_dataframe(self):\n \"\"\"Test handling empty input (should raise error).\"\"\"\n df = pd.DataFrame()\n with self.assertRaises(ValueError):\n f_826(df)\n def test_with_non_numeric_dataframe(self):\n \"\"\"Test handling unsupported data types.\"\"\"\n df = pd.DataFrame({\"A\": [\"a\", \"b\", \"c\"], \"B\": [\"d\", \"e\", \"f\"]})\n with self.assertRaises(TypeError):\n f_826(df)\n def test_plot_attributes(self):\n \"\"\"Test plot attributes.\"\"\"\n df = pd.DataFrame({\"X\": [10, 20, 30], \"Y\": [15, 25, 35]})\n _, pair_plot = f_826(df)\n self.assertIsInstance(pair_plot, sns.axisgrid.PairGrid)\n self.assertEqual(len(pair_plot.axes), 2) # Should have 2x2 grid for pair plot\n def test_single_column_dataframe(self):\n \"\"\"Test handling of DataFrame with a single numeric column.\"\"\"\n df = pd.DataFrame({\"A\": [1, 2, 3]})\n covariance_df, _ = f_826(df)\n self.assertEqual(covariance_df.loc[\"A\"].item(), 1.0)\n self.assertEqual(covariance_df.shape, (1, 1))", "apis": ["numpy.number", "seaborn.pairplot", "numpy.issubdtype"], "libs": ["seaborn", "numpy"], "doc": {"description": ["Generates a pair plot from a numeric DataFrame and calculates its covariance matrix."], "note": [], "params": ["df (pd.DataFrame): A pandas DataFrame with 
only numeric columns."], "returns": ["tuple:", "covariance_df (pd.DataFrame): The covariance matrix of the input DataFrame.", "pair_plot (sns.axisgrid.PairGrid): Pair plot of the input DataFrame."], "reqs": ["pandas", "numpy", "seaborn"], "raises": ["ValueError: If the DataFrame is empty.", "TypeError: If the DataFrame contains non-numeric data types."], "example": ["Examples:", ">>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})", ">>> covariance_df, ax = f_826(df)", ">>> type(ax)", "", ">>> covariance_df", "A B C", "A 1.0 1.0 1.0", "B 1.0 1.0 1.0", "C 1.0 1.0 1.0"]}} +{"task_id": "f_382", "prompt": "import math\nimport numpy as np\nfrom datetime import datetime\nimport pandas as pd\n\n\ndef f_382(\n start_time,\n end_time,\n step,\n columns=[\"Timestamp\", \"Sensor1\", \"Sensor2\", \"Sensor3\", \"SensorStatus\"],\n sensor_statuses=[\"OK\", \"MAINTENANCE_REQUIRED\", \"ERROR\"],\n random_seed=42,\n):\n \"\"\"\n Generate a DataFrame with detailed artificial sensor readings for specified timestamps\n and sensor statuses from a predefined list.\n\n The function generates sensor readings for Sensor1, Sensor2, and Sensor3 (or their\n corresponding named columns in the supplied column list) using sine, cosine, and tan\n functions, respectively, of the timestamp (converted to seconds), with a small random\n noise added to simulate real sensor data variability.\n SensorStatus is randomly chosen from the provided statuses for each timestamp.\n\n Parameters:\n - start_time (int): Start time in milliseconds since epoch.\n - end_time (int): End time in milliseconds since epoch. Must not be before start_time.\n - step (int): The interval in milliseconds between each generated data point. Must be positive.\n This step defines the frequency at which data points are generated. 
If the step\n does not neatly divide the interval between start_time and end_time into\n equal-sized portions, the last timestamp may be excluded.\n - columns (list of str, optional): Names of the DataFrame columns to be included in the output.\n Defaults to: ['Timestamp', 'Sensor1', 'Sensor2', 'Sensor3', 'SensorStatus'].\n Regardless of naming, the function will populate the first column with\n timestamp, the middle columns with sensor data, and the final with status.\n - sensor_statuses (list of str, optional): Possible statuses for the sensors to randomly assign in the dataset.\n Defaults to: ['OK', 'MAINTENANCE_REQUIRED', 'ERROR'].\n - random_seed (int, optional): Seed for the random number generator to ensure reproducible results.\n Defaults to 42.\n\n Returns:\n - pd.DataFrame: Generated sensor readings for the given timestamps.\n\n Requirements:\n - math\n - datetime\n - numpy\n - pandas\n\n Example:\n >>> df = f_382(0, 5000, 1000)\n >>> type(df)\n \n >>> df.head(1)\n Timestamp Sensor1 Sensor2 Sensor3 SensorStatus\n 0 1970-01-01 00:00:00.000000 0.049671 0.986174 0.064769 ERROR\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n\n if start_time > end_time:\n raise ValueError(\"start_time cannot be after end_time\")\n if step < 0:\n raise ValueError(\"step must be positive\")\n\n timestamps = list(range(start_time, end_time, step))\n\n data = []\n for ts in timestamps:\n dt = datetime.utcfromtimestamp(ts / 1000).strftime(\"%Y-%m-%d %H:%M:%S.%f\")\n sensor1 = math.sin(ts / 1000) + np.random.normal(0, 0.1)\n sensor2 = math.cos(ts / 1000) + np.random.normal(0, 0.1)\n sensor3 = math.tan(ts / 1000) + np.random.normal(0, 0.1)\n status = np.random.choice(sensor_statuses)\n row = [dt, sensor1, sensor2, sensor3, status]\n data.append(row)\n\n return pd.DataFrame(data, columns=columns)", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic case\n df = f_382(0, 10000, 
100, random_seed=42)\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(\n list(df.columns),\n [\"Timestamp\", \"Sensor1\", \"Sensor2\", \"Sensor3\", \"SensorStatus\"],\n )\n self.assertTrue(\n (df[\"SensorStatus\"].isin([\"OK\", \"MAINTENANCE_REQUIRED\", \"ERROR\"])).all()\n )\n def test_case_2(self):\n # Test custom columns\n columns = [\"Time\", \"Sensor_A\", \"Sensor_B\", \"Sensor_C\", \"Status\"]\n statuses = [\"WORKING\", \"NEEDS_CHECK\", \"FAILED\"]\n df = f_382(\n 1500, 3000, 50, columns=columns, sensor_statuses=statuses, random_seed=42\n )\n self.assertIsInstance(df, pd.DataFrame)\n self.assertEqual(list(df.columns), columns)\n self.assertTrue((df[\"Status\"].isin(statuses)).all())\n def test_case_3(self):\n # Test generated data integrity by comparing with expected results\n np.random.seed(42)\n ts = 0 # Using the starting timestamp for simplicity\n expected_sensor1 = math.sin(ts / 1000) + np.random.normal(0, 0.1, 1)[0]\n expected_sensor2 = math.cos(ts / 1000) + np.random.normal(0, 0.1, 1)[0]\n expected_sensor3 = math.tan(ts / 1000) + np.random.normal(0, 0.1, 1)[0]\n df = f_382(0, 100, 100, random_seed=42)\n self.assertAlmostEqual(df.iloc[0][\"Sensor1\"], expected_sensor1, places=5)\n self.assertAlmostEqual(df.iloc[0][\"Sensor2\"], expected_sensor2, places=5)\n self.assertAlmostEqual(df.iloc[0][\"Sensor3\"], expected_sensor3, places=5)\n def test_case_4(self):\n # Test handling invalid start times\n with self.assertRaises(ValueError):\n f_382(10000, 0, 100)\n def test_case_5(self):\n # Test handling incorrect end times\n with self.assertRaises(ValueError):\n f_382(1000, 900, 100)\n def test_case_6(self):\n # Test column handling\n columns = [\"Time\", \"Value1\", \"Value2\", \"Value3\", \"MachineStatus\"]\n df = f_382(0, 500, 100, columns=columns)\n self.assertEqual(list(df.columns), columns)\n # Too few/too many columns\n with self.assertRaises(ValueError):\n f_382(0, 500, 100, columns[:-1])\n with self.assertRaises(ValueError):\n f_382(0, 500, 
100, columns + [\"foo\", \"bar\"])\n def test_case_7(self):\n # Test sensor status handling\n with self.assertRaises(ValueError):\n f_382(0, 500, 100, [])\n statuses = [\"RUNNING\", \"SHUTDOWN\", \"ERROR\"]\n df = f_382(0, 500, 100, sensor_statuses=statuses)\n self.assertTrue((df[\"SensorStatus\"].isin(statuses)).all())\n def test_case_8(self):\n # Test random seed\n df1 = f_382(0, 500, 100, random_seed=42)\n df2 = f_382(0, 500, 100, random_seed=42)\n pd.testing.assert_frame_equal(df1, df2)\n def test_case_9(self):\n # Test invalid steps handling\n with self.assertRaises(ValueError):\n f_382(0, 1000, -100) # Step is negative\n with self.assertRaises(ValueError):\n f_382(0, 1000, 0) # Step is zero", "apis": ["math.tan", "numpy.random", "pandas.DataFrame", "numpy.random.choice", "datetime.datetime.utcfromtimestamp", "math.sin", "math.cos", "numpy.random.normal", "numpy.random.seed"], "libs": ["pandas", "numpy", "math", "datetime"], "doc": {"description": ["Generate a DataFrame with detailed artificial sensor readings for specified timestamps", "and sensor statuses from a predefined list.", "The function generates sensor readings for Sensor1, Sensor2, and Sensor3 (or their", "corresponding named columns in the supplied column list) using sine, cosine, and tan", "functions, respectively, of the timestamp (converted to seconds), with a small random", "noise added to simulate real sensor data variability.", "SensorStatus is randomly chosen from the provided statuses for each timestamp."], "note": [], "params": ["start_time (int): Start time in milliseconds since epoch.", "end_time (int): End time in milliseconds since epoch. Must not be before start_time.", "step (int): The interval in milliseconds between each generated data point. Must be positive.", "This step defines the frequency at which data points are generated. 
If the step", "does not neatly divide the interval between start_time and end_time into", "equal-sized portions, the last timestamp may be excluded.", "columns (list of str, optional): Names of the DataFrame columns to be included in the output.", "Defaults to: ['Timestamp', 'Sensor1', 'Sensor2', 'Sensor3', 'SensorStatus'].", "Regardless of naming, the function will populate the first column with", "timestamp, the middle columns with sensor data, and the final with status.", "sensor_statuses (list of str, optional): Possible statuses for the sensors to randomly assign in the dataset.", "Defaults to: ['OK', 'MAINTENANCE_REQUIRED', 'ERROR'].", "random_seed (int, optional): Seed for the random number generator to ensure reproducible results.", "Defaults to 42."], "returns": ["pd.DataFrame: Generated sensor readings for the given timestamps."], "reqs": ["math", "datetime", "numpy", "pandas"], "raises": [], "example": [">>> df = f_382(0, 5000, 1000)", ">>> type(df)", "", ">>> df.head(1)", "Timestamp Sensor1 Sensor2 Sensor3 SensorStatus", "0 1970-01-01 00:00:00.000000 0.049671 0.986174 0.064769 ERROR"]}} +{"task_id": "f_762", "prompt": "import pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\ndef f_762(df):\n \"\"\"\n Draw and return a correlation matrix heatmap for a DataFrame containing numerical columns.\n The title of the heatmap is set to 'Correlation Matrix'.\n \n Parameters:\n df (pandas.DataFrame): The DataFrame containing numerical columns to be used for correlation.\n\n Returns:\n matplotlib.axes._subplots.Axes: The matplotlib Axes object representing the heatmap.\n\n Requirements:\n - pandas\n - seaborn\n - matplotlib.pyplot\n\n Example:\n >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n >>> ax = f_762(df)\n >>> type(ax)\n \n\n \"\"\"", "canonical_solution": " correlation_matrix = df.corr()\n ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')\n ax.set_title('Correlation Matrix')\n return ax", 
"test": "import unittest\nimport pandas as pd\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_2(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [-4, -5, -6], 'c': [-7, -8, -9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_3(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [-7, -8, -9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_4(self):\n df = pd.DataFrame({'a': [1, 1, 1], 'b': [2, 2, 2], 'c': [3, 3, 3]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')\n \n def test_case_5(self):\n df = pd.DataFrame({'a': [1, 2, None], 'b': [4, None, 6], 'c': [None, 8, 9]})\n ax = f_762(df)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.title.get_text(), 'Correlation Matrix')", "apis": ["seaborn.heatmap"], "libs": ["seaborn"], "doc": {"description": ["Draw and return a correlation matrix heatmap for a DataFrame containing numerical columns.", "The title of the heatmap is set to 'Correlation Matrix'."], "note": [], "params": ["df (pandas.DataFrame): The DataFrame containing numerical columns to be used for correlation."], "returns": ["matplotlib.axes._subplots.Axes: The matplotlib Axes object representing the heatmap."], "reqs": ["pandas", "seaborn", "matplotlib.pyplot"], "raises": [], "example": [">>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})", ">>> ax = f_762(df)", ">>> type(ax)", ""]}} +{"task_id": "f_912", "prompt": "import warnings\nimport sqlite3\nimport pandas as pd\n\n\ndef f_912(db_path, query, 
warn_large_dataset=True):\n \"\"\"\n Fetches data from an SQLite database using the provided database path and SQL query.\n This function can optionally issue a warning when the dataset fetched contains more than 10,000 rows.\n\n Parameters:\n - db_path (str): The file path to the SQLite database from which data needs to be fetched.\n - query (str): The SQL query string used to retrieve data from the specified database.\n - warn_large_dataset (bool, optional): A boolean flag that, when set to True, triggers a \n warning if the retrieved dataset has more than 10,000 rows. Default is True.\n\n Returns:\n - pandas.DataFrame: A DataFrame containing the data fetched from the database.\n\n Requirements:\n - sqlite3\n - pandas\n - warnings\n\n Raises:\n - Exception: If any error occurs during database connection, SQL query execution, or data \n fetching. The error message provides details about the issue.\n\n Example:\n >>> data = f_912('/path/to/sqlite.db', 'SELECT * FROM table_name')\n >>> print(data)\n column1 column2\n 0 1 4\n 1 2 5\n 2 3 6\n \"\"\"", "canonical_solution": " if warn_large_dataset:\n warnings.simplefilter(\"always\")\n\n try:\n with sqlite3.connect(db_path) as conn:\n data = pd.read_sql_query(query, conn)\n\n if warn_large_dataset and data.shape[0] > 10000:\n warnings.warn(\"The data contains more than 10000 rows.\")\n\n return data\n\n except Exception as e:\n raise Exception(f\"Error fetching data from the database: {str(e)}\") from e", "test": "import unittest\nfrom unittest.mock import patch, MagicMock\nimport pandas as pd\nimport sqlite3\nimport warnings\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_912 function.\"\"\"\n def setUp(self):\n self.db_path = \"/path/to/sqlite.db\"\n self.query = \"SELECT * FROM table_name\"\n self.mock_data = pd.DataFrame({\"column1\": [1, 2, 3], \"column2\": [4, 5, 6]})\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_successful_query(self, mock_connect, mock_read_sql):\n 
\"\"\"\n Test f_912 function for successful query execution.\n \"\"\"\n mock_connect.return_value.__enter__.return_value = MagicMock()\n mock_read_sql.return_value = self.mock_data\n result = f_912(self.db_path, self.query)\n print(result)\n mock_connect.assert_called_with(self.db_path)\n mock_read_sql.assert_called_with(\n self.query, mock_connect.return_value.__enter__.return_value\n )\n self.assertTrue(result.equals(self.mock_data))\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_large_dataset_warning(self, mock_connect, mock_read_sql):\n \"\"\"\n Test f_912 function to check if it issues a warning for large datasets.\n \"\"\"\n large_data = pd.DataFrame({\"column1\": range(10001)})\n mock_read_sql.return_value = large_data\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter(\"always\")\n f_912(self.db_path, self.query)\n self.assertEqual(len(w), 1)\n self.assertTrue(\"more than 10000 rows\" in str(w[-1].message))\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_no_warning_for_small_dataset(self, mock_connect, mock_read_sql):\n \"\"\"\n Test f_912 function to ensure no warning for datasets smaller than 10000 rows.\n \"\"\"\n mock_read_sql.return_value = self.mock_data\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter(\"always\")\n f_912(self.db_path, self.query)\n self.assertEqual(len(w), 0)\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_database_exception(self, mock_connect, mock_read_sql):\n \"\"\"\n Test f_912 function to handle database connection exceptions.\n \"\"\"\n mock_connect.side_effect = sqlite3.OperationalError(\"Failed to connect\")\n with self.assertRaises(Exception) as context:\n f_912(self.db_path, self.query)\n self.assertIn(\"Error fetching data from the database\", str(context.exception))\n @patch(\"pandas.read_sql_query\")\n @patch(\"sqlite3.connect\")\n def test_sql_query_exception(self, mock_connect, 
mock_read_sql):\n \"\"\"\n Test f_912 function to handle SQL query execution exceptions.\n \"\"\"\n mock_read_sql.side_effect = pd.io.sql.DatabaseError(\"Failed to execute query\")\n with self.assertRaises(Exception) as context:\n f_912(self.db_path, self.query)\n self.assertIn(\"Error fetching data from the database\", str(context.exception))", "apis": ["warnings.warn", "warnings.simplefilter", "sqlite3.connect", "pandas.read_sql_query"], "libs": ["pandas", "warnings", "sqlite3"], "doc": {"description": ["Fetches data from an SQLite database using the provided database path and SQL query.", "This function can optionally issue a warning when the dataset fetched contains more than 10,000 rows."], "note": [], "params": ["db_path (str): The file path to the SQLite database from which data needs to be fetched.", "query (str): The SQL query string used to retrieve data from the specified database.", "warn_large_dataset (bool, optional): A boolean flag that, when set to True, triggers a", "warning if the retrieved dataset has more than 10,000 rows. Default is True."], "returns": ["pandas.DataFrame: A DataFrame containing the data fetched from the database."], "reqs": ["sqlite3", "pandas", "warnings"], "raises": ["Exception: If any error occurs during database connection, SQL query execution, or data", "fetching. 
The error message provides details about the issue."], "example": [">>> data = f_912('/path/to/sqlite.db', 'SELECT * FROM table_name')", ">>> print(data)", "column1 column2", "0 1 4", "1 2 5", "2 3 6"]}} +{"task_id": "f_929", "prompt": "import re\nfrom collections import Counter\nimport matplotlib.pyplot as plt\n\n\ndef f_929(text):\n \"\"\"\n Analyzes the frequency of words in a given text after lowercasing, removing punctuation, splitting into words,\n and plots the top 10 most common words.\n\n Parameters:\n - text (str): The input text to be analyzed.\n\n Returns:\n - list: A list of tuples containing the 10 most common words and their counts.\n - Axes: The matplotlib Axes object of the bar chart.\n\n Requirements:\n - re\n - collections.Counter\n - matplotlib.pyplot\n\n Example:\n >>> common_words, ax = f_929(\"This is a sample text. This text contains sample words like 'text', 'sample', and 'words'.\")\n >>> print(common_words)\n [('sample', 3), ('text', 3), ('this', 2), ('words', 2), ('is', 1), ('a', 1), ('contains', 1), ('like', 1), ('and', 1)]\n \"\"\"", "canonical_solution": " # Process text and count words\n cleaned_text = re.sub(f\"[{punctuation}]\", \"\", text).lower()\n words = cleaned_text.split()\n word_counts = Counter(words)\n most_common_words = word_counts.most_common(10)\n\n # Plotting\n _, ax = plt.subplots()\n if most_common_words: # Check if the list is not empty\n ax.bar(*zip(*most_common_words))\n else: # Handle empty case\n ax.bar([], [])\n\n return most_common_words, ax", "test": "import unittest\nfrom string import punctuation\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_929.\"\"\"\n def test_empty_text(self):\n \"\"\"\n Test the function with an empty string. Expect an empty list and a chart with no bars.\n \"\"\"\n common_words, _ = f_929(\"\")\n self.assertEqual(common_words, [])\n def test_single_word(self):\n \"\"\"\n Test the function with a text containing a single word repeated. 
Expect the word with its count.\n \"\"\"\n common_words, _ = f_929(\"test test test\")\n self.assertEqual(common_words, [(\"test\", 3)])\n def test_punctuation(self):\n \"\"\"\n Test the function with a text containing punctuations. Expect punctuations to be removed.\n \"\"\"\n common_words, _ = f_929(\"hello! hello, world.\")\n self.assertEqual(common_words, [(\"hello\", 2), (\"world\", 1)])\n def test_case_sensitivity(self):\n \"\"\"\n Test the function with a text containing the same word in different cases. Expect case insensitivity.\n \"\"\"\n common_words, _ = f_929(\"Hello hello HeLLo\")\n self.assertEqual(common_words, [(\"hello\", 3)])\n def test_common_scenario(self):\n \"\"\"\n Test the function with a standard sentence. Expect a correct count and ordering of words.\n \"\"\"\n text = \"This is a test. This is only a test.\"\n common_words, _ = f_929(text)\n expected = [(\"this\", 2), (\"is\", 2), (\"a\", 2), (\"test\", 2), (\"only\", 1)]\n self.assertEqual(common_words, expected)\n def tearDown(self):\n plt.close()", "apis": ["re.sub", "collections.Counter", "matplotlib.pyplot.subplots"], "libs": ["re", "collections", "matplotlib"], "doc": {"description": ["Analyzes the frequency of words in a given text after lowercasing, removing punctuation, splitting into words,", "and plots the top 10 most common words."], "note": [], "params": ["text (str): The input text to be analyzed."], "returns": ["list: A list of tuples containing the 10 most common words and their counts.", "Axes: The matplotlib Axes object of the bar chart."], "reqs": ["re", "collections.Counter", "matplotlib.pyplot"], "raises": [], "example": [">>> common_words, ax = f_929(\"This is a sample text. 
This text contains sample words like 'text', 'sample', and 'words'.\")", ">>> print(common_words)", "[('sample', 3), ('text', 3), ('this', 2), ('words', 2), ('is', 1), ('a', 1), ('contains', 1), ('like', 1), ('and', 1)]"]}} +{"task_id": "f_873", "prompt": "import itertools\nimport string\nimport pandas as pd\n\n\ndef f_873():\n \"\"\"\n Generate all possible combinations (with replacement) of three letters from the alphabet and save them in a pandas DataFrame.\n\n Parameters:\n - None\n\n Returns:\n - DataFrame: A pandas DataFrame with each row representing a unique combination of three letters.\n\n Requirements:\n - itertools\n - string\n - pandas\n\n Example:\n >>> df = f_873()\n >>> print(df.head())\n Letter 1 Letter 2 Letter 3\n 0 a a a\n 1 a a b\n 2 a a c\n 3 a a d\n 4 a a e\n \"\"\"", "canonical_solution": " LETTERS = list(string.ascii_lowercase)\n combinations = list(itertools.product(LETTERS, repeat=3))\n\n df = pd.DataFrame(combinations, columns=[\"Letter 1\", \"Letter 2\", \"Letter 3\"])\n\n return df", "test": "import unittest\nimport pandas as pd\nfrom itertools import product\nimport string\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_873.\"\"\"\n def test_combinations(self):\n \"\"\"\n Test if the function generates the correct combinations with replacement.\n \"\"\"\n correct_combinations = list(product(string.ascii_lowercase, repeat=3))\n result_df = f_873()\n result_combinations = [tuple(row) for row in result_df.values]\n self.assertEqual(\n result_combinations,\n correct_combinations,\n \"The combinations are not correct.\",\n )\n def test_columns(self):\n \"\"\"\n Test if the DataFrame has the correct column names.\n \"\"\"\n result_df = f_873()\n self.assertEqual(\n list(result_df.columns),\n [\"Letter 1\", \"Letter 2\", \"Letter 3\"],\n \"Column names are not correct.\",\n )\n def test_shape(self):\n \"\"\"\n Test if the shape of the DataFrame is correct.\n \"\"\"\n result_df = f_873()\n self.assertEqual(\n 
result_df.shape,\n (26**3, 3),\n \"Shape of the DataFrame is not correct.\",\n )\n def test_data_type(self):\n \"\"\"\n Test if all DataFrame columns contain strings.\n \"\"\"\n result_df = f_873()\n for col in result_df.columns:\n self.assertTrue(\n result_df[col].apply(lambda x: isinstance(x, str)).all(),\n f\"Column {col} does not contain all strings.\",\n )\n def test_no_duplicates(self):\n \"\"\"\n Test if there are no duplicate combinations in the DataFrame.\n \"\"\"\n result_df = f_873()\n result_combinations = [tuple(row) for row in result_df.values]\n self.assertEqual(\n len(result_combinations),\n len(set(result_combinations)),\n \"Found duplicate combinations.\",\n )", "apis": ["pandas.DataFrame", "itertools.product", "string.ascii_lowercase"], "libs": ["itertools", "pandas", "string"], "doc": {"description": ["Generate all possible combinations (with replacement) of three letters from the alphabet and save them in a pandas DataFrame."], "note": [], "params": ["None"], "returns": ["DataFrame: A pandas DataFrame with each row representing a unique combination of three letters."], "reqs": ["itertools", "string", "pandas"], "raises": [], "example": [">>> df = f_873()", ">>> print(df.head())", "Letter 1 Letter 2 Letter 3", "0 a a a", "1 a a b", "2 a a c", "3 a a d", "4 a a e"]}} +{"task_id": "f_423", "prompt": "import sqlite3\nimport pandas as pd\nimport seaborn as sns\n\n\ndef f_423(db_name=\"test.db\", table_name=\"People\"):\n \"\"\"\n Draw the age distribution of the persons in an SQLite3 table and returns the Axes object of the plot.\n Raises a ValueError if the loaded data contains negative age values.\n\n Parameters:\n db_name (str, optional): The full path to the SQLite3 database file. Defaults to 'test.db'.\n table_name (str, optional): The name of the table to plot from. 
Defaults to 'People'.\n\n Returns:\n matplotlib.axes._subplots.Axes: Axes object representing the age distribution plot,\n with x-axis showing age and a default of bins=30, kde=True.\n\n Requirements:\n - sqlite3\n - pandas\n - seaborn\n\n Examples:\n >>> ax = f_423('path/to/test.db', 'People')\n >>> type(ax)\n \n >>> ax = f_423()\n >>> type(ax)\n \n \"\"\"", "canonical_solution": " conn = sqlite3.connect(db_name)\n df = pd.read_sql_query(f\"SELECT age from {table_name}\", conn)\n\n if (df[\"age\"] < 0).any():\n raise ValueError(\"Data contains negative age values.\")\n\n ax = sns.histplot(data=df, x=\"age\", bins=30, kde=True)\n ax.set_xlabel(\"age\")\n return ax", "test": "import unittest\nimport os\nimport sqlite3\nimport matplotlib.pyplot as plt\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Setup temporary directory\n self.test_dir = tempfile.TemporaryDirectory()\n # Create test_alt.db with People table\n self.alt_db_path = os.path.join(self.test_dir.name, \"test_alt.db\")\n conn = sqlite3.connect(self.alt_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE People (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO People VALUES (?, ?)\", [(\"Alice\", 25), (\"Bob\", 30)]\n )\n conn.commit()\n conn.close()\n # Create a standard test.db with Employees table\n self.default_db_path = os.path.join(self.test_dir.name, \"test.db\")\n conn = sqlite3.connect(self.default_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE Employees (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO Employees VALUES (?, ?)\", [(\"Charlie\", 35), (\"David\", 40)]\n )\n conn.commit()\n conn.close()\n # Create standard db with more examples\n self.multiple_db_path = os.path.join(self.test_dir.name, \"test_multiple.db\")\n conn = sqlite3.connect(self.multiple_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE MultipleAge (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO MultipleAge 
VALUES (?, ?)\",\n [(\"Alice\", 25), (\"Bob\", 30), (\"Charlie\", 35)],\n )\n conn.commit()\n conn.close()\n # Create a db for testing edge cases - negative age\n self.negative_age_db_path = os.path.join(\n self.test_dir.name, \"test_negative_age.db\"\n )\n conn = sqlite3.connect(self.negative_age_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE NegativeAge (name TEXT, age INT)\")\n cursor.executemany(\n \"INSERT INTO NegativeAge VALUES (?, ?)\", [(\"Eve\", -1), (\"Frank\", 20)]\n )\n conn.commit()\n conn.close()\n # Create a db for testing edge cases - empty\n self.empty_db_path = os.path.join(self.test_dir.name, \"test_empty.db\")\n conn = sqlite3.connect(self.empty_db_path)\n cursor = conn.cursor()\n cursor.execute(\"CREATE TABLE EmptyAge (name TEXT, age INT)\")\n conn.commit()\n conn.close()\n def tearDown(self):\n self.test_dir.cleanup()\n plt.close(\"all\")\n def _check_plot(self, ax, contains_data=True):\n self.assertTrue(isinstance(ax, plt.Axes), \"The plot should be an Axes object.\")\n self.assertEqual(ax.get_xlabel(), \"age\", \"The x-axis label should be 'age'.\")\n if contains_data:\n self.assertTrue(len(ax.lines) > 0, \"The plot should contain a KDE line.\")\n def test_case_1(self):\n ax = f_423(db_name=self.default_db_path, table_name=\"Employees\")\n self._check_plot(ax)\n def test_case_2(self):\n ax = f_423(db_name=self.alt_db_path)\n self._check_plot(ax)\n def test_case_3(self):\n ax = f_423(db_name=self.default_db_path, table_name=\"Employees\")\n self._check_plot(ax)\n def test_case_4(self):\n ax = f_423(db_name=self.multiple_db_path, table_name=\"MultipleAge\")\n self._check_plot(ax)\n def test_case_5(self):\n ax = f_423(db_name=self.empty_db_path, table_name=\"EmptyAge\")\n self._check_plot(ax, False)\n def test_case_6(self):\n # Test for non-existent table\n with self.assertRaises(Exception):\n f_423(db_name=self.default_db_path, table_name=\"Nonexistent\")\n def test_case_7(self):\n # Test for negative age values\n with 
self.assertRaises(ValueError):\n f_423(db_name=self.negative_age_db_path, table_name=\"NegativeAge\")", "apis": ["sqlite3.connect", "pandas.read_sql_query", "seaborn.histplot"], "libs": ["seaborn", "pandas", "sqlite3"], "doc": {"description": ["Draw the age distribution of the persons in an SQLite3 table and returns the Axes object of the plot.", "Raises a ValueError if the loaded data contains negative age values."], "note": [], "params": ["db_name (str, optional): The full path to the SQLite3 database file. Defaults to 'test.db'.", "table_name (str, optional): The name of the table to plot from. Defaults to 'People'."], "returns": ["matplotlib.axes._subplots.Axes: Axes object representing the age distribution plot,", "with x-axis showing age and a default of bins=30, kde=True."], "reqs": ["sqlite3", "pandas", "seaborn"], "raises": [], "example": ["Examples:", ">>> ax = f_423('path/to/test.db', 'People')", ">>> type(ax)", "", ">>> ax = f_423()", ">>> type(ax)", ""]}} +{"task_id": "f_766", "prompt": "import pandas as pd\nimport os\nimport sys\n\ndef f_766(file_path: str, column_name: str) -> pd.DataFrame:\n \"\"\"\n Load a CSV file into a Pandas DataFrame, replace all occurrences of the string '\\n' with the string '
'\n in the specified column, and return the cleaned DataFrame.\n \n Parameters:\n - file_path (str): The path to the CSV file to be read.\n - column_name (str): The name of the column in which to replace occurrences of '\\n' with '
'.\n \n Returns:\n - pd.DataFrame: The cleaned Pandas DataFrame.\n \n Requirements:\n - pandas\n - os\n - sys\n \n Examples:\n >>> df = f_766('data.csv', 'Value')\n >>> print(df['Value'].iloc[0])\n \"some
text\"\n >>> df = f_766('another_data.csv', 'Comments')\n >>> print(df['Comments'].iloc[1])\n \"hello
world\"\n \"\"\"", "canonical_solution": " if not os.path.exists(file_path):\n print(f'File does not exist: {file_path}')\n sys.exit(1)\n\n df = pd.read_csv(file_path)\n \n # Check if the column exists\n if column_name in df.columns:\n df[column_name] = df[column_name].replace({'\\n': '
'}, regex=True)\n else:\n print(f\"Column '{column_name}' does not exist in the DataFrame. No changes were made.\")\n\n return df", "test": "import unittest\nimport pandas as pd\nimport os\nclass TestCases(unittest.TestCase):\n def setUp(self):\n os.mkdir('test')\n data = {\n 'ID': [1, 2, 3],\n 'Value': [\"Hello\\nWorld\", \"Python\\nis\\nawesome\", \"No newlines here\"]\n }\n df = pd.DataFrame(data)\n df.to_csv('test/test_data_1.csv', index=False)\n data = {\n 'ID': [1, 2],\n 'Comments': [\"Good\\nMorning\", \"Happy\\nCoding\"]\n }\n df = pd.DataFrame(data)\n df.to_csv('test/test_data_2.csv', index=False)\n data = {\n 'ID': [1, 2],\n 'Text': [\"Line 1\", \"Line 2\\nLine 3\"]\n }\n df = pd.DataFrame(data)\n df.to_csv('test/test_data_3.csv', index=False)\n def tearDown(self):\n os.remove('test/test_data_1.csv')\n os.remove('test/test_data_2.csv')\n os.remove('test/test_data_3.csv')\n os.rmdir('test')\n def test_case_1(self):\n df = f_766('test/test_data_1.csv', 'Value')\n self.assertEqual(df['Value'].iloc[0], \"Hello
World\")\n self.assertEqual(df['Value'].iloc[1], \"Python
is
awesome\")\n self.assertEqual(df['Value'].iloc[2], \"No newlines here\")\n \n def test_case_2(self):\n df = f_766('test/test_data_2.csv', 'Comments')\n self.assertEqual(df['Comments'].iloc[0], \"Good
Morning\")\n self.assertEqual(df['Comments'].iloc[1], \"Happy
Coding\")\n \n def test_case_3(self):\n df = f_766('test/test_data_3.csv', 'Text')\n self.assertEqual(df['Text'].iloc[0], \"Line 1\")\n self.assertEqual(df['Text'].iloc[1], \"Line 2
Line 3\")\n \n def test_case_4(self):\n df1 = f_766('test/test_data_1.csv', 'Value')\n df2 = f_766('test/test_data_1.csv', '')\n self.assertEqual(df1['Value'].iloc[0], \"Hello
World\")\n self.assertEqual(df2['Value'].iloc[0], \"Hello\\nWorld\")\n \n def test_case_5(self):\n df1 = f_766('test/test_data_1.csv', 'Value')\n df2 = f_766('test/test_data_1.csv', 'NonExistentColumn')\n self.assertEqual(df1['Value'].iloc[0], \"Hello
World\")\n self.assertEqual(df2['Value'].iloc[0], \"Hello\\nWorld\")", "apis": ["pandas.read_csv", "sys.exit", "pandas.DataFrame", "os.path", "os.path.exists"], "libs": ["os", "pandas", "sys"], "doc": {"description": ["Load a CSV file into a Pandas DataFrame, replace all occurrences of the string '\\n' with the string '
'", "in the specified column, and return the cleaned DataFrame."], "note": [], "params": ["file_path (str): The path to the CSV file to be read.", "column_name (str): The name of the column in which to replace occurrences of '\\n' with '
'."], "returns": ["pd.DataFrame: The cleaned Pandas DataFrame."], "reqs": ["pandas", "os", "sys"], "raises": [], "example": ["Examples:", ">>> df = f_766('data.csv', 'Value')", ">>> print(df['Value'].iloc[0])", "\"some
text\"", ">>> df = f_766('another_data.csv', 'Comments')", ">>> print(df['Comments'].iloc[1])", "\"hello
world\""]}} +{"task_id": "f_386", "prompt": "from datetime import datetime\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_386(start_time, end_time, step, amplitude, period, seed=0):\n \"\"\"\n Generate a time series with a given seasonality from the start time to the end time\n with a given step, and plot the time series with the seasonality.\n\n Parameters:\n - start_time (int): The start epoch time in milliseconds.\n = end_time (int): The end epoch time in milliseconds.\n - step (int): The step in milliseconds between each data point. Must be at least 1.\n - amplitude (float): The amplitude of the seasonality.\n - period (int): The period of the seasonality in milliseconds. Must be at least 0.\n - seed (int): Random seed for reproducibility. Defaults to 0.\n\n Returns:\n plt.Axes: A plot of the generated 'Time Series with Seasonality',\n with 'Timestamp' on x-axis and 'Value' on y-axis.\n\n Requirements:\n - datetime.datetime\n - pandas\n - numpy\n\n Example:\n >>> ax = f_386(0, 10000, 100, 1, 1000)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]\n \"\"\"", "canonical_solution": " np.random.seed(seed)\n\n if period <= 0 or step < 1:\n raise ValueError(\"Invalid input values\")\n\n COLUMNS = [\"Timestamp\", \"Value\"]\n\n timestamps = np.arange(start_time, end_time, step)\n df = pd.DataFrame(columns=COLUMNS)\n\n if amplitude == 0:\n values = [0] * len(timestamps)\n else:\n values = np.random.normal(size=len(timestamps))\n\n data = []\n for i, ts in enumerate(timestamps):\n dt = datetime.fromtimestamp(ts / 1000).strftime(\"%Y-%m-%d %H:%M:%S.%f\")\n value = values[i] + amplitude * np.sin(2 * np.pi * ts / period)\n 
data.append([dt, value])\n\n df = pd.DataFrame(data, columns=COLUMNS)\n\n ax = df.plot(x=\"Timestamp\", y=\"Value\", title=\"Time Series with Seasonality\")\n ax.set_ylabel(\"Value\")\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Test basic properties\n test_cases = [\n (0, 10000, 100, 1, 1000),\n (0, 100000, 1000, 2, 5000),\n (0, 10000, 100, 0.5, 1000),\n (0, 10000, 100, 1, 500),\n (0, 10000, 500, 1, 1000),\n ]\n for start_time, end_time, step, amplitude, period in test_cases:\n with self.subTest(\n start_time=start_time,\n end_time=end_time,\n step=step,\n amplitude=amplitude,\n period=period,\n ):\n ax = f_386(start_time, end_time, step, amplitude, period)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Time Series with Seasonality\")\n self.assertEqual(ax.get_xlabel(), \"Timestamp\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n def test_case_2(self):\n # Test large step\n # Plot should still behave as expected even when step > (end_time - start_time)\n ax = f_386(0, 10000, 200000, 1, 1000)\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_title(), \"Time Series with Seasonality\")\n self.assertEqual(ax.get_xlabel(), \"Timestamp\")\n self.assertEqual(ax.get_ylabel(), \"Value\")\n def test_case_3(self):\n # Test handling invalid input types - period\n with self.assertRaises(ValueError):\n f_386(0, 10000, 100, 1, 0)\n with self.assertRaises(ValueError):\n f_386(0, 10000, 100, 1, -1)\n def test_case_4(self):\n # Test handling invalid input types - step\n with self.assertRaises(ValueError):\n f_386(0, 10000, -100, 1, 1000)\n with self.assertRaises(ValueError):\n f_386(0, 10000, 0, 1, 1000)\n def test_case_5(self):\n # Test plot data integrity\n ax = f_386(0, 10000, 100, 1, 1000)\n xy_data = ax.get_lines()[0].get_xydata()\n expected_length = (10000 - 0) // 100\n self.assertEqual(len(xy_data), expected_length)\n def 
test_case_6(self):\n # Test random seed\n ax1 = f_386(0, 10000, 100, 1, 1000, seed=42)\n xy_data1 = ax1.get_lines()[0].get_xydata()\n ax2 = f_386(0, 10000, 100, 1, 1000, seed=42)\n xy_data2 = ax2.get_lines()[0].get_xydata()\n ax3 = f_386(0, 10000, 100, 1, 1000, seed=43)\n xy_data3 = ax3.get_lines()[0].get_xydata()\n self.assertTrue(\n np.array_equal(xy_data1, xy_data2),\n \"Results should be the same with the same seed\",\n )\n self.assertFalse(\n np.array_equal(xy_data1, xy_data3),\n \"Results should be different with different seeds\",\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.sin", "numpy.arange", "pandas.DataFrame", "numpy.random", "datetime.datetime.fromtimestamp", "numpy.pi", "numpy.random.normal", "numpy.random.seed"], "libs": ["pandas", "numpy", "datetime"], "doc": {"description": ["Generate a time series with a given seasonality from the start time to the end time", "with a given step, and plot the time series with the seasonality."], "note": [], "params": ["start_time (int): The start epoch time in milliseconds.", "= end_time (int): The end epoch time in milliseconds.", "step (int): The step in milliseconds between each data point. Must be at least 1.", "amplitude (float): The amplitude of the seasonality.", "period (int): The period of the seasonality in milliseconds. Must be at least 0.", "seed (int): Random seed for reproducibility. 
Defaults to 0."], "returns": ["plt.Axes: A plot of the generated 'Time Series with Seasonality',", "with 'Timestamp' on x-axis and 'Value' on y-axis."], "reqs": ["datetime.datetime", "pandas", "numpy"], "raises": [], "example": [">>> ax = f_386(0, 10000, 100, 1, 1000)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(-20.0, 0, '1970-01-01 10:00:08.000000'), Text(0.0, 0, '1970-01-01 10:00:00.000000'), Text(20.0, 0, '1970-01-01 10:00:02.000000'), Text(40.0, 0, '1970-01-01 10:00:04.000000'), Text(60.0, 0, '1970-01-01 10:00:06.000000'), Text(80.0, 0, '1970-01-01 10:00:08.000000'), Text(100.0, 0, ''), Text(120.0, 0, '')]"]}} +{"task_id": "f_907", "prompt": "from matplotlib import pyplot as plt\nfrom sklearn.decomposition import PCA\n\n\ndef f_907(arr):\n \"\"\"\n Performs Principal Component Analysis (PCA) on the sum of rows of a 2D numpy array and plots the explained variance ratio.\n\n Note:\n - The title of the plot is set to \"Explained Variance Ratio of Principal Components\".\n\n Parameters:\n - arr (numpy.ndarray): A 2D numpy array. 
The input data for PCA.\n\n Returns:\n - ax (matplotlib.axes.Axes): An Axes object from matplotlib.\n\n Requirements:\n - scikit-learn\n - matplotlib\n\n Notes:\n - The function assumes that 'arr' is a valid 2D numpy array.\n - Only the first principal component is considered in this analysis.\n - The plot illustrates the proportion of the dataset's variance that lies along the axis of this first principal component.\n \n Example:\n >>> import numpy as np\n >>> arr = np.array([[i+j for i in range(3)] for j in range(5)])\n >>> axes = f_907(arr)\n >>> axes.get_title()\n 'Explained Variance Ratio of Principal Components'\n \"\"\"", "canonical_solution": " row_sums = arr.sum(axis=1)\n pca = PCA(n_components=1)\n pca.fit(row_sums.reshape(-1, 1))\n\n # Plotting (requires matplotlib and sklearn)\n\n _, ax = plt.subplots()\n ax.bar([0], pca.explained_variance_ratio_)\n ax.set_title(\"Explained Variance Ratio of Principal Components\")\n ax.set_xticks([0])\n ax.set_xticklabels([\"PC1\"])\n\n return ax", "test": "import unittest\nimport numpy as np\nfrom sklearn.decomposition import PCA\nfrom matplotlib import pyplot as plt\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for function f_907.\"\"\"\n def test_basic_functionality(self):\n \"\"\"Test basic functionality of f_907.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n result = f_907(arr)\n self.assertIsInstance(result, plt.Axes)\n def test_plot_title_verification(self):\n \"\"\"Test that the plot title is correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n result = f_907(arr)\n self.assertEqual(\n result.get_title(), \"Explained Variance Ratio of Principal Components\"\n )\n def test_bar_count_verification(self):\n \"\"\"Test that the number of bars is correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n result = f_907(arr)\n n_components = min(2, arr.sum(axis=1).reshape(-1, 1).shape[1])\n self.assertEqual(len(result.patches), 
n_components)\n def test_variance_ratios_verification(self):\n \"\"\"Test that the variance ratios are correct.\"\"\"\n arr = np.array([[i + j for i in range(3)] for j in range(5)])\n row_sums = arr.sum(axis=1)\n n_components = min(2, row_sums.reshape(-1, 1).shape[1])\n pca = PCA(n_components=n_components)\n pca.fit(row_sums.reshape(-1, 1))\n result = f_907(arr)\n for bar, variance_ratio in zip(result.patches, pca.explained_variance_ratio_):\n self.assertAlmostEqual(bar.get_height(), variance_ratio)\n def test_empty_input(self):\n \"\"\"Test that an empty input raises a ValueError.\"\"\"\n arr = np.array([])\n with self.assertRaises(ValueError):\n f_907(arr)", "apis": ["matplotlib.pyplot.subplots", "sklearn.decomposition.PCA"], "libs": ["sklearn", "matplotlib"], "doc": {"description": ["Performs Principal Component Analysis (PCA) on the sum of rows of a 2D numpy array and plots the explained variance ratio.", "Notes:", "- The function assumes that 'arr' is a valid 2D numpy array.", "- Only the first principal component is considered in this analysis.", "- The plot illustrates the proportion of the dataset's variance that lies along the axis of this first principal component."], "note": ["The title of the plot is set to \"Explained Variance Ratio of Principal Components\"."], "params": ["arr (numpy.ndarray): A 2D numpy array. 
The input data for PCA."], "returns": ["ax (matplotlib.axes.Axes): An Axes object from matplotlib."], "reqs": ["scikit-learn", "matplotlib"], "raises": [], "example": [">>> import numpy as np", ">>> arr = np.array([[i+j for i in range(3)] for j in range(5)])", ">>> axes = f_907(arr)", ">>> axes.get_title()", "'Explained Variance Ratio of Principal Components'"]}} +{"task_id": "f_815", "prompt": "import pathlib\nimport os\n\n\ndef f_815(path: str, delimiter: str = os.path.sep) -> list:\n \"\"\"\n Validates that a given file path does not contain invalid characters for file paths\n then splits it into path components using a specified delimiter.\n\n Parameters:\n - path (str): The file path to split. If empty, the function returns an empty list.\n - delimiter (str): The delimiter to use for splitting the path.\n Defaults to the system's path separator (os.path.sep).\n\n Returns:\n - list: A list of the path components if the path is valid;\n otherwise, an empty list if the path contains invalid characters.\n\n Raises:\n - ValueError: If the path contains invalid characters.\n\n Requirements:\n - pathlib\n - os\n\n Notes:\n - Backslashes ('\\\\') are internally converted to forward slashes ('/') before processing.\n - This function treats '<', '>', ':', '\"', '|', '?', '*' as invalid characters in paths.\n\n Examples:\n >>> f_815('Docs/src/Scripts/temp', '/')\n ['Docs', 'src', 'Scripts', 'temp']\n >>> f_815(r'Docs\\\\src\\\\Scripts\\\\temp', '\\\\\\\\')\n ['Docs', 'src', 'Scripts', 'temp']\n \"\"\"", "canonical_solution": "\n if not path:\n return []\n\n path = path.replace(\"\\\\\", \"/\")\n\n path_obj = pathlib.Path(path)\n\n invalid_chars = set('<>:\"|?*')\n if any(\n set(str(component)).intersection(invalid_chars) for component in path_obj.parts\n ):\n return []\n\n return [\n component\n for component in path_obj.parts\n if component and component != delimiter\n ]", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n # Testing 
a standard UNIX-like path with '/' delimiter\n self.assertEqual(\n f_815(\"Docs/src/Scripts/temp\", \"/\"),\n [\"Docs\", \"src\", \"Scripts\", \"temp\"],\n )\n def test_case_2(self):\n # Testing a standard Windows-like path with '\\' delimiter\n self.assertEqual(\n f_815(\"Docs\\\\src\\\\Scripts\\\\temp\", \"\\\\\"),\n [\"Docs\", \"src\", \"Scripts\", \"temp\"],\n )\n def test_case_3(self):\n # Testing an empty path string\n self.assertEqual(f_815(\"\", \"/\"), [])\n def test_case_4(self):\n # Testing a path with invalid characters\n self.assertEqual(f_815(\"Docs/src/Scripts|temp\", \"/\"), [])\n def test_case_5(self):\n # Testing a path with a different delimiter\n self.assertEqual(f_815(\"Docs|src|Scripts|temp\", \"|\"), [])\n def test_case_6(self):\n # Handle leading and trailing delimiters\n self.assertEqual(f_815(\"/Docs/src/Scripts/\", \"/\"), [\"Docs\", \"src\", \"Scripts\"])\n def test_case_7(self):\n # Test mixed delimiters given expected conversion\n self.assertEqual(\n f_815(\"Docs/src\\\\Scripts/temp\", \"\\\\\"), [\"Docs\", \"src\", \"Scripts\", \"temp\"]\n )\n self.assertEqual(\n f_815(\"Docs/src\\\\Scripts/temp\", \"/\"), [\"Docs\", \"src\", \"Scripts\", \"temp\"]\n )", "apis": ["os.path", "pathlib.Path"], "libs": ["os", "pathlib"], "doc": {"description": ["Validates that a given file path does not contain invalid characters for file paths", "then splits it into path components using a specified delimiter.", "Notes:", "- Backslashes ('\\\\') are internally converted to forward slashes ('/') before processing.", "- This function treats '<', '>', ':', '\"', '|', '?', '*' as invalid characters in paths."], "note": [], "params": ["path (str): The file path to split. 
If empty, the function returns an empty list.", "delimiter (str): The delimiter to use for splitting the path.", "Defaults to the system's path separator (os.path.sep)."], "returns": ["list: A list of the path components if the path is valid;", "otherwise, an empty list if the path contains invalid characters."], "reqs": ["pathlib", "os"], "raises": ["ValueError: If the path contains invalid characters."], "example": ["Examples:", ">>> f_815('Docs/src/Scripts/temp', '/')", "['Docs', 'src', 'Scripts', 'temp']", ">>> f_815(r'Docs\\\\src\\\\Scripts\\\\temp', '\\\\\\\\')", "['Docs', 'src', 'Scripts', 'temp']"]}} +{"task_id": "f_876", "prompt": "import itertools\nimport string\nimport pandas as pd\n\n\ndef f_876():\n \"\"\"\n Generate all possible 3-letter combinations of the alphabet, save them in a pandas DataFrame,\n and draw a histogram of the frequency of the first letters in these combinations.\n\n This function uses itertools.product to create all possible combinations of three letters.\n It then creates a DataFrame from these combinations and plots a histogram to show the frequency\n of each letter appearing as the first letter in these combinations.\n\n Parameters:\n - None\n\n Returns:\n tuple: A tuple containing:\n - DataFrame: A pandas DataFrame with all 3-letter combinations.\n - Axes: A matplotlib Axes object representing the histogram plot.\n\n Requirements:\n - itertools\n - string\n - pandas\n\n Example:\n >>> df, ax = f_876()\n >>> print(df.head())\n a b c\n 0 a a a\n 1 a a b\n 2 a a c\n 3 a a d\n 4 a a e\n \"\"\"", "canonical_solution": " LETTERS = list(string.ascii_lowercase)\n combinations = list(itertools.product(LETTERS, repeat=3))\n df = pd.DataFrame(combinations, columns=[\"a\", \"b\", \"c\"])\n\n # Getting value counts and ensuring the correct order of letters\n value_counts = df[\"a\"].value_counts().reindex(LETTERS, fill_value=0)\n\n # Plotting the histogram with the correct order\n ax = value_counts.plot(kind=\"bar\")\n\n return df, ax", 
"test": "import unittest\nimport itertools\nimport string\nimport matplotlib.pyplot as plt\nLETTERS = list(string.ascii_lowercase)\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_876\"\"\"\n def test_dataframe_shape(self):\n \"\"\"\n Test if the DataFrame has the correct shape (17576 rows, 3 columns)\n \"\"\"\n df, _ = f_876()\n self.assertEqual(df.shape, (17576, 3))\n def test_dataframe_columns(self):\n \"\"\"\n Test if the DataFrame has the correct column names (a, b, c)\n \"\"\"\n df, _ = f_876()\n self.assertListEqual(list(df.columns), [\"a\", \"b\", \"c\"])\n def test_histogram_plot(self):\n \"\"\"\n Test if the histogram plot is an instance of matplotlib Axes\n \"\"\"\n _, ax = f_876()\n self.assertTrue(isinstance(ax, plt.Axes))\n def test_first_column_values(self):\n \"\"\"\n Test if the first column of the DataFrame contains only lowercase letters\n \"\"\"\n df, _ = f_876()\n self.assertTrue(all(letter in string.ascii_lowercase for letter in df[\"a\"]))\n def test_no_empty_values(self):\n \"\"\"\n Test if there are no empty values in the DataFrame\n \"\"\"\n df, _ = f_876()\n self.assertFalse(df.isnull().values.any())\n def tearDown(self):\n plt.close()", "apis": ["pandas.DataFrame", "itertools.product", "string.ascii_lowercase"], "libs": ["itertools", "pandas", "string"], "doc": {"description": ["Generate all possible 3-letter combinations of the alphabet, save them in a pandas DataFrame,", "and draw a histogram of the frequency of the first letters in these combinations.", "This function uses itertools.product to create all possible combinations of three letters.", "It then creates a DataFrame from these combinations and plots a histogram to show the frequency", "of each letter appearing as the first letter in these combinations."], "note": [], "params": ["None"], "returns": ["tuple: A tuple containing:", "DataFrame: A pandas DataFrame with all 3-letter combinations.", "Axes: A matplotlib Axes object representing the histogram 
plot."], "reqs": ["itertools", "string", "pandas"], "raises": [], "example": [">>> df, ax = f_876()", ">>> print(df.head())", "a b c", "0 a a a", "1 a a b", "2 a a c", "3 a a d", "4 a a e"]}} +{"task_id": "f_586", "prompt": "import pandas as pd\nfrom sklearn.linear_model import LinearRegression\n\ndef f_586(df, target):\n \"\"\"\n Perform a linear regression analysis on a given DataFrame.\n \n Parameters:\n - df (pd.DataFrame): The pandas DataFrame.\n - target (str): The target variable.\n \n Returns:\n - score (float): The R-squared score of the model.\n\n Requirements:\n - pandas\n - sklearn\n\n Example:\n >>> import numpy as np\n >>> np.random.seed(42)\n >>> df = pd.DataFrame({'feature': np.random.rand(100), 'target': np.random.rand(100)}) # Explicitly using pd\n >>> r_squared = f_586(df, 'target')\n >>> print(r_squared)\n 0.0011582111228732872\n \"\"\"", "canonical_solution": " X = pd.DataFrame.drop(df, target, axis=1) \n y = pd.Series(df[target]) \n \n model = LinearRegression()\n model.fit(X, y)\n\n return model.score(X, y)", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame([[0, 1, 2], [3, 4, 5], [6, 7, 8]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertEqual(r_squared, 1.0)\n \n def test_case_2(self):\n df = pd.DataFrame([[-1, 1, 2], [3, 4, 5], [6, 7, 8]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertEqual(r_squared, 1.0)\n \n def test_case_3(self):\n df = pd.DataFrame([[0, 0, 0], [1, 1, 1], [2, 2, 2]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertEqual(r_squared, 1.0)\n def test_case_4(self):\n df = pd.DataFrame([[0, 0, 9], [1, 1, 35], [2, 2, 78]], columns = ['x', 'y', 'z'])\n r_squared = f_586(df, 'z')\n self.assertFalse(r_squared == 1.0)\n def test_case_5(self):\n df = pd.DataFrame([[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], columns = ['x', 'y', 'z', 'w'])\n r_squared = f_586(df, 'w')\n self.assertEqual(r_squared, 1.0)", "apis": 
["pandas.DataFrame", "sklearn.linear_model.LinearRegression", "pandas.DataFrame.drop", "pandas.Series"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Perform a linear regression analysis on a given DataFrame."], "note": [], "params": ["df (pd.DataFrame): The pandas DataFrame.", "target (str): The target variable."], "returns": ["score (float): The R-squared score of the model."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> import numpy as np", ">>> np.random.seed(42)", ">>> df = pd.DataFrame({'feature': np.random.rand(100), 'target': np.random.rand(100)}) # Explicitly using pd", ">>> r_squared = f_586(df, 'target')", ">>> print(r_squared)", "0.0011582111228732872"]}} {"task_id": "f_593", "prompt": "import pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\n\ndef f_593(df, target_column):\n \"\"\"\n Perform a logistic regression on a DataFrame to predict a specific target column.\n\n Parameters:\n - df (pd.DataFrame): The input DataFrame.\n - target_column (str): The target column name.\n\n Returns:\n - accuracy (float): The accuracy of the logistic regression model.\n\n Example:\n >>> np.random.seed(42)\n >>> data = np.random.randint(0, 100, size=(100, 4)) # Using np to generate random data\n >>> columns = ['A', 'B', 'C', 'target']\n >>> df = pd.DataFrame(data, columns=columns) # Explicitly using pd to create DataFrame\n >>> f_593(df, 'target')\n 0.0\n \"\"\"", "canonical_solution": " if target_column not in df.columns:\n raise ValueError('Target column does not exist in DataFrame')\n\n X = df.drop(columns=target_column) # Operate directly on the DataFrame\n y = df[target_column]\n\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n model = LogisticRegression(max_iter=200)\n model.fit(X_train, y_train)\n\n y_pred = model.predict(X_test)\n accuracy = 
accuracy_score(y_test, y_pred)\n\n return accuracy", "test": "import unittest\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [0, 1, 0]})\n self.assertEqual(f_593(df, 'C'), 0.0)\n def test_case_2(self):\n df = pd.DataFrame({'A': [1, 2, 3, -10], 'B': [4, 5, 6, -10], 'C': [1, 1, 1, 0]})\n self.assertEqual(f_593(df, 'C'), 1.0)\n def test_case_3(self):\n df = pd.DataFrame({'A': [1, 2, 3, -10], 'B': [4, 5, 6, -10], 'C': [0, 0, 0, 1]})\n self.assertEqual(f_593(df, 'C'), 1.0)\n def test_case_4(self):\n df = pd.DataFrame({'A': [-10, 2, 3, -10], 'B': [-10, 5, 6, -10], 'C': [1, 0, 0, 1]})\n self.assertEqual(f_593(df, 'C'), 1.0)\n def test_case_5(self):\n df = pd.DataFrame({'A': [-10, 2, 3, -10], 'B': [-10, 5, 6, -10], 'C': [0, 1, 1, 0]})\n self.assertEqual(f_593(df, 'C'), 1.0)", "apis": ["sklearn.model_selection.train_test_split", "sklearn.metrics.accuracy_score", "sklearn.linear_model.LogisticRegression"], "libs": ["sklearn"], "doc": {"description": ["Perform a logistic regression on a DataFrame to predict a specific target column."], "note": [], "params": ["df (pd.DataFrame): The input DataFrame.", "target_column (str): The target column name."], "returns": ["accuracy (float): The accuracy of the logistic regression model."], "reqs": [], "raises": [], "example": [">>> np.random.seed(42)", ">>> data = np.random.randint(0, 100, size=(100, 4)) # Using np to generate random data", ">>> columns = ['A', 'B', 'C', 'target']", ">>> df = pd.DataFrame(data, columns=columns) # Explicitly using pd to create DataFrame", ">>> f_593(df, 'target')", "0.0"]}} -{"task_id": "f_361", "prompt": "import subprocess\nimport os\nimport time\nfrom datetime import datetime\n\n\ndef f_361(script_dir, scripts, delay):\n \"\"\"\n Execute a list of bash scripts with a specified delay between each script.\n\n Parameters:\n script_dir (str): Path to the directory containing the scripts.\n scripts (list): List of script filenames to be 
executed. Must not be empty.\n If a script is not found, the function raises a FileNotFoundError.\n delay (int): The delay in seconds between each script execution. Must at least 0.\n\n Returns:\n list: A list of timestamps indicating the start time of each script execution.\n\n Requirements:\n - subprocess\n - os\n - time\n - datetime.datetime\n\n Example:\n >>> f_361('/path/to/scripts/', ['script1.sh', 'script2.sh'], 5)\n ['2023-09-09 10:10:10', '2023-09-09 10:10:15']\n \"\"\"", "canonical_solution": " if delay < 0:\n raise ValueError(\"delay cannot be negative.\")\n if not scripts:\n raise ValueError(\"No scripts provided.\")\n start_times = []\n for script in scripts:\n script_path = os.path.join(script_dir, script)\n start_time = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n start_times.append(start_time)\n\n result = subprocess.call(script_path, shell=True)\n if result != 0:\n raise FileNotFoundError(f\"Script not found: {script_path}\")\n\n time.sleep(delay)\n return start_times", "test": "import unittest\nimport tempfile\nimport os\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Create a temporary directory to store scripts\n self.temp_dir = tempfile.TemporaryDirectory()\n self.script_dir = self.temp_dir.name\n def tearDown(self):\n # Clean up the temporary directory\n self.temp_dir.cleanup()\n def create_temp_script(self, script_content):\n # Helper function to create a temporary script file with the given content\n fd, path = tempfile.mkstemp(dir=self.script_dir, suffix=\".sh\")\n with os.fdopen(fd, \"w\") as f:\n f.write(\"#!/bin/bash\\n\")\n f.write(script_content)\n os.chmod(path, 0o755)\n return os.path.basename(path)\n def test_case_1(self):\n # Testing with a single script and delay of 1 second\n script_name = self.create_temp_script(\"echo 'Test'\")\n scripts = [script_name]\n delay = 1\n start_times = f_361(self.script_dir, scripts, delay)\n self.assertEqual(len(start_times), 1)\n self.assertTrue(\n 
isinstance(datetime.strptime(start_times[0], \"%Y-%m-%d %H:%M:%S\"), datetime)\n )\n def test_case_2(self):\n # Testing with multiple scripts and a longer delay\n script_names = [\n self.create_temp_script(\"echo 'Test'\"),\n self.create_temp_script(\"echo 'Test 2'\"),\n ]\n delay = 2\n start_times = f_361(self.script_dir, script_names, delay)\n self.assertEqual(len(start_times), 2)\n time_diff = datetime.strptime(\n start_times[1], \"%Y-%m-%d %H:%M:%S\"\n ) - datetime.strptime(start_times[0], \"%Y-%m-%d %H:%M:%S\")\n self.assertEqual(time_diff.seconds, delay)\n def test_case_3(self):\n # Testing with an invalid script path\n with self.assertRaises(FileNotFoundError):\n f_361(self.script_dir, [\"this-doesn't-exist\"], 1)\n def test_case_4(self):\n # Testing with no scripts (empty list)\n with self.assertRaises(Exception):\n f_361(self.script_dir, [], 1)\n def test_case_5(self):\n # Testing with zero delay\n script_names = [\n self.create_temp_script(\"echo 'Test'\"),\n self.create_temp_script(\"echo 'Test 2'\"),\n ]\n delay = 0\n start_times = f_361(self.script_dir, script_names, delay)\n self.assertEqual(len(start_times), 2)\n def test_case_6(self):\n # Test handling invalid delay\n script_names = [\n self.create_temp_script(\"echo 'Test'\"),\n self.create_temp_script(\"echo 'Test 2'\"),\n ]\n with self.assertRaises(Exception):\n f_361(self.script_dir, script_names, -1)", "apis": ["subprocess.call", "os.path", "datetime.datetime.now", "time.sleep", "os.path.join"], "libs": ["subprocess", "datetime", "time", "os"], "doc": {"description": ["Execute a list of bash scripts with a specified delay between each script."], "note": [], "params": ["script_dir (str): Path to the directory containing the scripts.", "scripts (list): List of script filenames to be executed. Must not be empty.", "If a script is not found, the function raises a FileNotFoundError.", "delay (int): The delay in seconds between each script execution. 
Must at least 0."], "returns": ["list: A list of timestamps indicating the start time of each script execution."], "reqs": ["subprocess", "os", "time", "datetime.datetime"], "raises": [], "example": [">>> f_361('/path/to/scripts/', ['script1.sh', 'script2.sh'], 5)", "['2023-09-09 10:10:10', '2023-09-09 10:10:15']"]}} -{"task_id": "f_754", "prompt": "from collections import Counter\nimport itertools\n\ndef f_754(letters: list, repetitions: int) -> dict:\n \"\"\"\n Count the frequency of each letter in a list after repeating it a given number of times.\n\n Input:\n - letters (list): A list of single-character strings representing letters.\n - repetitions (int): The number of times to repeat the list.\n\n Output:\n Returns a dictionary where the keys are the letters and the values are their frequencies.\n\n Requirements:\n - collections.Counter\n - itertools\n\n Example:\n >>> f_754(['A', 'B', 'C'], 2)\n {'A': 2, 'B': 2, 'C': 2}\n >>> f_754(['A', 'B'], 3)\n {'A': 3, 'B': 3}\n \"\"\"", "canonical_solution": " # Create a flattened list by repeating the original list\n flattened_list = list(itertools.chain(*[letters for _ in range(repetitions)]))\n \n # Count the occurrences of each letter in the flattened list\n counts = dict(Counter(flattened_list))\n \n return counts", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n result = f_754(['A', 'B', 'C'], 2)\n expected = {'A': 2, 'B': 2, 'C': 2}\n self.assertEqual(result, expected)\n \n def test_case_2(self):\n result = f_754(['A', 'B'], 3)\n expected = {'A': 3, 'B': 3}\n self.assertEqual(result, expected)\n \n def test_case_3(self):\n result = f_754([], 2)\n expected = {}\n self.assertEqual(result, expected)\n \n def test_case_4(self):\n result = f_754(['A', 'B', 'A'], 2)\n expected = {'A': 4, 'B': 2}\n self.assertEqual(result, expected)\n \n def test_case_5(self):\n result = f_754(['A'], 0)\n expected = {}\n self.assertEqual(result, expected)", "apis": ["collections.Counter", 
"itertools.chain"], "libs": ["collections", "itertools"], "doc": {"description": ["Count the frequency of each letter in a list after repeating it a given number of times.", "Input:", "- letters (list): A list of single-character strings representing letters.", "- repetitions (int): The number of times to repeat the list.", "Output:", "Returns a dictionary where the keys are the letters and the values are their frequencies."], "note": [], "params": [], "returns": [], "reqs": ["collections.Counter", "itertools"], "raises": [], "example": [">>> f_754(['A', 'B', 'C'], 2)", "{'A': 2, 'B': 2, 'C': 2}", ">>> f_754(['A', 'B'], 3)", "{'A': 3, 'B': 3}"]}} -{"task_id": "f_884", "prompt": "import socket\nimport select\nimport queue\nfrom datetime import datetime, timedelta\n\n\ndef f_884(\n server_address=\"localhost\", server_port=12345, buffer_size=1024, run_duration=5\n):\n \"\"\"\n Run a non-blocking echo server that appends the server's current time to received data and sends it back to the client, while handling exceptional conditions for each socket.\n\n Parameters:\n - server_address (str): The address for the server to listen on. Default is 'localhost'.\n - server_port (int): The port for the server to listen on. Default is 12345.\n - buffer_size (int): The buffer size for data reception. Default is 1024 bytes.\n - run_duration (int): The duration (in seconds) for which the server will run. Default is 5 seconds.\n\n Returns:\n - str: A status message indicating the server's operation and run duration.\n\n Requirements:\n - socket\n - select\n - queue\n - datetime\n\n Example:\n >>> print(f_884())\n 'Server started on localhost:12345. 
Ran for 5 seconds.'\n \"\"\"", "canonical_solution": " server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n server.setblocking(0)\n server.bind((server_address, server_port))\n server.listen(5)\n inputs = [server]\n outputs = []\n message_queues = {}\n end_time = datetime.now() + timedelta(seconds=run_duration)\n\n try:\n while datetime.now() < end_time:\n readable, writable, _ = select.select(inputs, outputs, inputs, 1)\n for s in readable:\n if s is server:\n connection, _ = s.accept()\n connection.setblocking(0)\n inputs.append(connection)\n message_queues[connection] = queue.Queue()\n else:\n data = s.recv(buffer_size)\n if data:\n message_queues[s].put(f\"{datetime.now()}: {data.decode()}\")\n if s not in outputs:\n outputs.append(s)\n else:\n if s in outputs:\n outputs.remove(s)\n inputs.remove(s)\n s.close()\n del message_queues[s]\n\n for s in writable:\n if s not in message_queues:\n continue # Skip if socket's queue has been removed\n\n try:\n next_msg = message_queues[s].get_nowait()\n except queue.Empty:\n outputs.remove(s)\n else:\n s.sendall(next_msg.encode(\"utf-8\"))\n\n finally:\n server.close()\n\n return f\"Server started on {server_address}:{server_port}. 
Ran for {run_duration} seconds.\"", "test": "import unittest\nimport socket\nimport time\nimport threading\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_884 function.\"\"\"\n def setUp(self):\n # Start the server in a separate thread\n self.server_thread = threading.Thread(\n target=f_884, args=(\"localhost\", 12345, 1024, 10)\n )\n self.server_thread.start()\n time.sleep(1)\n def tearDown(self):\n # Ensure the server thread is closed after each test\n self.server_thread.join()\n def test_queue_empty_condition(self):\n \"\"\"Test if the server correctly handles an empty queue condition.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n # Send a message and then close the socket immediately\n client.sendall(\"Hello\".encode())\n client.close()\n # The server should handle the empty queue condition without crashing\n # Wait briefly to allow server to process the situation\n time.sleep(1)\n # Since the server should continue running and not crash,\n # we can attempt a new connection to check server's state\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as new_client:\n new_client.connect((\"localhost\", 12345))\n test_message = \"Test after empty queue\"\n new_client.sendall(test_message.encode())\n response = new_client.recv(1024).decode()\n self.assertIn(test_message, response)\n def test_server_response(self):\n \"\"\"Test if server correctly echoes received data with server time.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n test_message = \"Hello, Server!\"\n client.sendall(test_message.encode())\n response = client.recv(1024).decode()\n self.assertIn(test_message, response)\n def test_multiple_connections(self):\n \"\"\"Test the server's ability to handle multiple client connections.\"\"\"\n responses = []\n for _ in range(5):\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n 
client.connect((\"localhost\", 12345))\n client.sendall(\"Test\".encode())\n responses.append(client.recv(1024).decode())\n for response in responses:\n # Assuming the server response format includes the timestamp followed by the echoed message\n self.assertTrue(\"Test\" in response)\n def test_no_data_received(self):\n \"\"\"Test server behavior when no data is received from the client.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n # Not sending any data\n client.settimeout(2)\n with self.assertRaises(socket.timeout):\n client.recv(1024)\n def test_server_closes_after_duration(self):\n \"\"\"Test if the server closes after the specified duration.\"\"\"\n # Wait for a duration longer than the server's run time\n time.sleep(5)\n with self.assertRaises((socket.timeout, ConnectionRefusedError)):\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.settimeout(2)\n client.connect((\"localhost\", 12345))\n client.recv(1024)\n def test_large_data_transfer(self):\n \"\"\"Test the server's ability to handle a large data transfer.\"\"\"\n large_data = \"A\" * 1000\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n client.sendall(large_data.encode())\n # Initialize an empty string to accumulate the response\n total_response = \"\"\n while True:\n # Receive data in chunks\n part = client.recv(1024).decode()\n total_response += part\n # Check if the end of the message is reached\n if large_data in total_response:\n break\n # Assert that the large data string is in the response\n self.assertIn(large_data, total_response)", "apis": ["select.select", "queue.Empty", "socket.AF_INET", "datetime.datetime.now", "datetime.timedelta", "socket.socket", "socket.SOCK_STREAM", "queue.Queue"], "libs": ["queue", "socket", "datetime", "select"], "doc": {"description": ["Run a non-blocking echo server that appends the server's current time to 
received data and sends it back to the client, while handling exceptional conditions for each socket."], "note": [], "params": ["server_address (str): The address for the server to listen on. Default is 'localhost'.", "server_port (int): The port for the server to listen on. Default is 12345.", "buffer_size (int): The buffer size for data reception. Default is 1024 bytes.", "run_duration (int): The duration (in seconds) for which the server will run. Default is 5 seconds."], "returns": ["str: A status message indicating the server's operation and run duration."], "reqs": ["socket", "select", "queue", "datetime"], "raises": [], "example": [">>> print(f_884())", "'Server started on localhost:12345. Ran for 5 seconds.'"]}} -{"task_id": "f_833", "prompt": "import binascii\nimport base64\nimport urllib.parse\nimport codecs\n\n\ndef f_833(hex_string):\n \"\"\"\n Convert a hexadecimal string to various encodings.\n\n This function takes a hexadecimal string as input and performs several encoding operations. \n Initially, it decodes the hexadecimal string to bytes and then converts these bytes into a UTF-8 string. \n This UTF-8 string is subsequently encoded into different formats: hexadecimal, base64, UTF-8, UTF-16, \n UTF-32, ASCII (if possible), URL encoding, and ROT13. Note that if ASCII not possible, returns 'Not representable in ASCII'.\n\n Parameters:\n - hex_string (str): The input string in hexadecimal format.\n\n Returns:\n - dict: A dictionary containing the input string encoded in various formats. The dictionary's keys\n are the encoding types ('hex', 'base64', 'utf-8', 'utf-16', 'utf-32', 'ASCII', 'URL', 'ROT13'),\n and the values are the corresponding encoded strings. 
If the string cannot be represented in ASCII,\n the 'ASCII' key maps to 'Not representable in ASCII'.\n\n Requirements:\n - binascii\n - base64\n - urllib\n - codecs\n\n Example:\n >>> f_833(\"4a4b4c\")\n {'hex': '4a4b4c', 'base64': 'SktM', 'utf-8': 'JKL', 'utf-16': 'JKL', 'utf-32': 'JKL', 'ASCII': 'JKL', 'URL': 'JKL', 'ROT13': 'WXY'}\n\n >>> f_833(\"68656c6c6f\")\n {'hex': '68656c6c6f', 'base64': 'aGVsbG8=', 'utf-8': 'hello', 'utf-16': 'hello', 'utf-32': 'hello', 'ASCII': 'hello', 'URL': 'hello', 'ROT13': 'uryyb'}\n \"\"\"", "canonical_solution": " encodings = {}\n\n # Convert hex string to its string representation\n decoded_str = bytes.fromhex(hex_string).decode(\"utf-8\")\n\n # Hexadecimal encoding\n encodings[\"hex\"] = binascii.hexlify(decoded_str.encode()).decode()\n\n # Base64 encoding\n encodings[\"base64\"] = base64.b64encode(decoded_str.encode()).decode()\n\n # UTF-8 encoding\n encodings[\"utf-8\"] = decoded_str.encode(\"utf-8\").decode()\n\n # UTF-16 encoding\n encodings[\"utf-16\"] = decoded_str.encode(\"utf-16\").decode(\"utf-16\")\n\n # UTF-32 encoding\n encodings[\"utf-32\"] = decoded_str.encode(\"utf-32\").decode(\"utf-32\")\n\n # ASCII encoding - only if characters are in ASCII range\n try:\n encodings[\"ASCII\"] = decoded_str.encode(\"ascii\").decode()\n except UnicodeEncodeError:\n encodings[\"ASCII\"] = \"Not representable in ASCII\"\n\n # URL encoding\n encodings[\"URL\"] = urllib.parse.quote(decoded_str)\n\n # ROT13 encoding\n encodings[\"ROT13\"] = codecs.encode(decoded_str, \"rot_13\")\n\n return encodings", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_833\"\"\"\n def test_hex_string_sample(self):\n \"\"\"Test the sample input from the problem description.\"\"\"\n hex_str = \"4a4b4c\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"SktM\")\n self.assertEqual(result[\"utf-8\"], \"JKL\")\n self.assertEqual(result[\"utf-16\"], \"JKL\")\n 
self.assertEqual(result[\"utf-32\"], \"JKL\")\n self.assertEqual(result[\"ASCII\"], \"JKL\")\n self.assertEqual(result[\"URL\"], \"JKL\")\n self.assertEqual(result[\"ROT13\"], \"WXY\")\n def test_hex_string_1(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"68656c6c6f\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"aGVsbG8=\")\n self.assertEqual(result[\"utf-8\"], \"hello\")\n self.assertEqual(result[\"utf-16\"], \"hello\")\n self.assertEqual(result[\"utf-32\"], \"hello\")\n self.assertEqual(result[\"ASCII\"], \"hello\")\n self.assertEqual(result[\"URL\"], \"hello\")\n self.assertEqual(result[\"ROT13\"], \"uryyb\")\n def test_hex_string_2(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"776f726c64\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"d29ybGQ=\")\n self.assertEqual(result[\"utf-8\"], \"world\")\n self.assertEqual(result[\"utf-16\"], \"world\")\n self.assertEqual(result[\"utf-32\"], \"world\")\n self.assertEqual(result[\"ASCII\"], \"world\")\n self.assertEqual(result[\"URL\"], \"world\")\n self.assertEqual(result[\"ROT13\"], \"jbeyq\")\n def test_hex_string_3(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"616263\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"YWJj\")\n self.assertEqual(result[\"utf-8\"], \"abc\")\n self.assertEqual(result[\"utf-16\"], \"abc\")\n self.assertEqual(result[\"utf-32\"], \"abc\")\n self.assertEqual(result[\"ASCII\"], \"abc\")\n self.assertEqual(result[\"URL\"], \"abc\")\n self.assertEqual(result[\"ROT13\"], \"nop\")\n def test_hex_string_4(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"313233\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n 
self.assertEqual(result[\"base64\"], \"MTIz\")\n self.assertEqual(result[\"utf-8\"], \"123\")\n self.assertEqual(result[\"utf-16\"], \"123\")\n self.assertEqual(result[\"utf-32\"], \"123\")\n self.assertEqual(result[\"ASCII\"], \"123\")\n self.assertEqual(result[\"URL\"], \"123\")\n self.assertEqual(result[\"ROT13\"], \"123\")\n def test_hex_string_non_ascii(self):\n \"\"\"Test a hex string with non-ASCII characters.\"\"\"\n hex_str = \"c3a9\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"w6k=\")\n self.assertEqual(result[\"utf-8\"], \"\u00e9\")\n self.assertEqual(result[\"utf-16\"], \"\u00e9\")\n self.assertEqual(result[\"utf-32\"], \"\u00e9\")\n self.assertEqual(result[\"ASCII\"], \"Not representable in ASCII\")\n self.assertEqual(result[\"URL\"], \"%C3%A9\")\n self.assertEqual(result[\"ROT13\"], \"\u00e9\")", "apis": ["urllib.parse", "binascii.hexlify", "base64.b64encode", "codecs.encode", "urllib.parse.quote"], "libs": ["urllib", "binascii", "base64", "codecs"], "doc": {"description": ["Convert a hexadecimal string to various encodings.", "This function takes a hexadecimal string as input and performs several encoding operations.", "Initially, it decodes the hexadecimal string to bytes and then converts these bytes into a UTF-8 string.", "This UTF-8 string is subsequently encoded into different formats: hexadecimal, base64, UTF-8, UTF-16,", "UTF-32, ASCII (if possible), URL encoding, and ROT13. Note that if ASCII not possible, returns 'Not representable in ASCII'.", ">>> f_833(\"68656c6c6f\")", "{'hex': '68656c6c6f', 'base64': 'aGVsbG8=', 'utf-8': 'hello', 'utf-16': 'hello', 'utf-32': 'hello', 'ASCII': 'hello', 'URL': 'hello', 'ROT13': 'uryyb'}"], "note": [], "params": ["hex_string (str): The input string in hexadecimal format."], "returns": ["dict: A dictionary containing the input string encoded in various formats. 
The dictionary's keys", "are the encoding types ('hex', 'base64', 'utf-8', 'utf-16', 'utf-32', 'ASCII', 'URL', 'ROT13'),", "and the values are the corresponding encoded strings. If the string cannot be represented in ASCII,", "the 'ASCII' key maps to 'Not representable in ASCII'."], "reqs": ["binascii", "base64", "urllib", "codecs"], "raises": [], "example": [">>> f_833(\"4a4b4c\")", "{'hex': '4a4b4c', 'base64': 'SktM', 'utf-8': 'JKL', 'utf-16': 'JKL', 'utf-32': 'JKL', 'ASCII': 'JKL', 'URL': 'JKL', 'ROT13': 'WXY'}"]}} -{"task_id": "f_415", "prompt": "import json\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nfrom collections import defaultdict\n\n\ndef f_415(input_file: str) -> plt.Axes:\n \"\"\"\n Read a list of dictionaries from a JSON file, calculate the results (mean and median for each key)\n via numpy, convert the input data into a pandas DataFrame with the keys as \"X\" and values as \"Y\"\n for visualization with a seaborn box plot, then return the results and box plot.\n\n Parameters:\n - input_file (str): The input JSON file name with absolute path.\n\n Returns:\n - results (dict): Dictionary where each key is a unique key from the original input, and each\n value is a corresponding dict, with keys 'mean' and 'median' and the statistics\n as values.\n - ax (plt.Axes): The box plot of aggregated 'Values for Each Key' in the input data.\n\n Requirements:\n - json\n - seaborn\n - matplotlib.pyplot\n - pandas\n - numpy\n - collections.defaultdict\n\n Example:\n >>> results, ax = f_415(\"/path/to/data.json\")\n >>> ax\n \n >>> results\n {'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 2.0, 'median': 3.0}}\n \"\"\"", "canonical_solution": " with open(input_file, \"r\") as f:\n data = json.load(f)\n\n stats = defaultdict(list)\n for d in data:\n for key, value in d.items():\n stats[key].append(value)\n\n results = {\n k: {\"mean\": np.mean(v), \"median\": np.median(v)} for k, v in stats.items()\n }\n\n data = 
pd.DataFrame(data).melt(var_name=\"X\", value_name=\"Y\")\n ax = sns.boxplot(data=data, x=\"X\", y=\"Y\")\n ax.set_title(\"Boxplot of Values for Each Key\")\n return results, ax", "test": "import unittest\nimport os\nimport tempfile\nimport matplotlib.pyplot as plt\nimport json\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n # Setup a temporary directory and write sample JSON data to a temp file\n cls.temp_dir = tempfile.TemporaryDirectory()\n cls.sample_data_file = os.path.join(cls.temp_dir.name, \"sample_data.json\")\n cls.sample_data = [\n {\"A\": 10, \"B\": 20, \"C\": 30},\n {\"A\": 15, \"B\": 25, \"C\": 35},\n {\"A\": 20, \"B\": 30, \"C\": 40},\n ]\n with open(cls.sample_data_file, \"w\") as f:\n json.dump(cls.sample_data, f)\n # Create an invalid JSON file for testing\n cls.invalid_json_file = os.path.join(cls.temp_dir.name, \"invalid.json\")\n with open(cls.invalid_json_file, \"w\") as f:\n f.write(\"invalid content\")\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n # Test if the function can read the JSON data file and return a plot\n _, ax = f_415(self.sample_data_file)\n self.assertIsInstance(ax, plt.Axes, \"The function should return a plot (Axes).\")\n self.assertTrue(len(ax.get_xticks()) > 0, \"The plot should have x-axis ticks.\")\n self.assertTrue(len(ax.get_yticks()) > 0, \"The plot should have y-axis ticks.\")\n self.assertTrue(ax.get_title(), \"Boxplot of Values for Each Key\")\n def test_case_2(self):\n # Check result correctness\n results, _ = f_415(self.sample_data_file)\n self.assertIn(\"A\", results)\n self.assertIn(\"B\", results)\n self.assertIn(\"C\", results)\n self.assertEqual(results[\"A\"][\"mean\"], 15.0)\n self.assertEqual(results[\"A\"][\"median\"], 15.0)\n self.assertEqual(results[\"B\"][\"mean\"], 25.0)\n self.assertEqual(results[\"B\"][\"median\"], 25.0)\n self.assertEqual(results[\"C\"][\"mean\"], 35.0)\n 
self.assertEqual(results[\"C\"][\"median\"], 35.0)\n def test_case_3(self):\n # Test the correctness of the x-axis labels\n _, ax = f_415(self.sample_data_file)\n x_labels = [label.get_text() for label in ax.get_xticklabels()]\n expected_x_labels = [\"A\", \"B\", \"C\"]\n self.assertListEqual(\n x_labels, expected_x_labels, \"The x-axis labels are not as expected.\"\n )\n def test_case_4(self):\n # Test the correctness of the y-axis data points\n _, ax = f_415(self.sample_data_file)\n # Correctly extract the height of the boxes in the box plot\n boxes = [\n box.get_height() for box in ax.containers if hasattr(box, \"get_height\")\n ]\n self.assertTrue(\n all(height > 0 for height in boxes),\n \"Each box plot should have y-data points.\",\n )\n def test_case_5(self):\n # Test if the function raises an error for non-existent file\n with self.assertRaises(FileNotFoundError):\n f_415(os.path.join(self.temp_dir.name, \"non_existent.json\"))\n def test_case_6(self):\n # Test if the function raises an error for invalid JSON format\n with self.assertRaises(json.JSONDecodeError):\n f_415(os.path.join(self.temp_dir.name, \"invalid.json\"))", "apis": ["numpy.median", "seaborn.boxplot", "numpy.mean", "matplotlib.pyplot.Axes", "json.load", "collections.defaultdict", "pandas.DataFrame"], "libs": ["numpy", "collections", "json", "matplotlib", "pandas", "seaborn"], "doc": {"description": ["Read a list of dictionaries from a JSON file, calculate the results (mean and median for each key)", "via numpy, convert the input data into a pandas DataFrame with the keys as \"X\" and values as \"Y\"", "for visualization with a seaborn box plot, then return the results and box plot."], "note": [], "params": ["input_file (str): The input JSON file name with absolute path."], "returns": ["results (dict): Dictionary where each key is a unique key from the original input, and each", "value is a corresponding dict, with keys 'mean' and 'median' and the statistics", "as values.", "ax (plt.Axes): 
The box plot of aggregated 'Values for Each Key' in the input data."], "reqs": ["json", "seaborn", "matplotlib.pyplot", "pandas", "numpy", "collections.defaultdict"], "raises": [], "example": [">>> results, ax = f_415(\"/path/to/data.json\")", ">>> ax", "", ">>> results", "{'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 2.0, 'median': 3.0}}"]}} -{"task_id": "f_821", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.decomposition import PCA\n\n\ndef f_821(array, seed=None):\n \"\"\"\n Shuffles the columns of a numpy array randomly, performs Principal Component Analysis (PCA)\n to reduce the dimensionality to 2 principal components, and returns these components as a pandas DataFrame.\n\n Parameters:\n - array (numpy.ndarray): A 2D numpy array where each row is an observation and each column is a feature.\n - seed (int, optional): Seed for the random number generator. Defaults to None (not set).\n\n Returns:\n - pandas.DataFrame: DataFrame with columns 'PC1' and 'PC2' representing the two principal components.\n\n Raises:\n - ValueError: If the input array is not 2D.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Note:\n - PCA reduction will default to the number of features if fewer than 2.\n - An named but empty DataFrame is returned for arrays without features or with empty content.\n\n Examples:\n >>> array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n >>> df = f_821(array, seed=42)\n >>> df\n PC1 PC2\n 0 5.59017 4.440892e-16\n 1 -5.59017 4.440892e-16\n >>> df.shape\n (2, 2)\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n if not isinstance(array, np.ndarray) or len(array.shape) != 2:\n raise ValueError(\"Input must be a 2D numpy array.\")\n\n if array.size == 0 or array.shape[1] == 0:\n return pd.DataFrame(columns=[\"PC1\", \"PC2\"])\n\n shuffled_array = np.copy(array)\n np.random.shuffle(np.transpose(shuffled_array))\n\n n_components = min(2, shuffled_array.shape[1])\n pca = 
PCA(n_components=n_components)\n principal_components = pca.fit_transform(shuffled_array)\n\n column_labels = [\"PC1\", \"PC2\"][:n_components]\n df = pd.DataFrame(data=principal_components, columns=column_labels)\n\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.array2x5 = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.array5x1 = np.array([[1], [2], [3], [4], [5]])\n def test_with_empty_array(self):\n \"\"\"Test handling of an empty array.\"\"\"\n array = np.empty((0, 0))\n df = f_821(array, seed=42)\n self.assertTrue(df.empty, \"The returned DataFrame should be empty.\")\n self.assertTrue(\n (df.columns == [\"PC1\", \"PC2\"]).all(),\n \"Column names should be 'PC1' and 'PC2' even for an empty DataFrame.\",\n )\n def test_with_2x5_array(self):\n \"\"\"Test PCA on a 2x5 array with shuffled columns.\"\"\"\n df = f_821(self.array2x5, seed=42)\n self.assertEqual(df.shape, (2, 2), \"DataFrame shape should be (2, 2).\")\n self.assertTrue(\n (df.columns == [\"PC1\", \"PC2\"]).all(),\n \"Column names should be 'PC1' and 'PC2'.\",\n )\n def test_with_5x1_array(self):\n \"\"\"Test PCA on a 5x1 array.\"\"\"\n df = f_821(self.array5x1, seed=0)\n self.assertEqual(\n df.shape, (5, 1), \"DataFrame shape should be (5, 1) for a single component.\"\n )\n self.assertTrue(\n (df.columns == [\"PC1\"]).all(),\n \"Column name should be 'PC1' for a single component.\",\n )\n def test_invalid_input(self):\n \"\"\"Test handling of invalid input.\"\"\"\n with self.assertRaises(ValueError):\n f_821(np.array([1, 2, 3]), seed=42)\n def test_reproducibility(self):\n \"\"\"Test if the function is reproducible with the same seed.\"\"\"\n df1 = f_821(self.array2x5, seed=42)\n df2 = f_821(self.array2x5, seed=42)\n pd.testing.assert_frame_equal(\n df1, df2, \"Results should be identical when using the same seed.\"\n )\n def test_pca_correctness(self):\n \"\"\"\n Test PCA correctness by ensuring that the variance is 
captured correctly\n in the principal components.\n \"\"\"\n # Creating a simple array where variance is higher in one dimension\n # This dataset is designed so that the first principal component should\n # capture the majority of the variance.\n array = np.array(\n [\n [1, 2, 3, 4, 5],\n [1, 2, 3, 4, 5],\n [1, 2, 3, 4, 5],\n [1, 2, 3, 4, 5],\n [10, 10, 10, 10, 10],\n ]\n ) # Increased variance in the last row\n df = f_821(array, seed=0)\n # The PCA should be able to capture the variance in the first principal component\n # significantly more than in the second, if applicable.\n # Asserting that the first PC values are not all the same,\n # which indicates it captured the variance.\n self.assertFalse(\n df[\"PC1\"].std() == 0,\n \"PCA should capture variance along the first principal component.\",\n )", "apis": ["numpy.ndarray", "numpy.random", "numpy.copy", "numpy.random.seed", "numpy.transpose", "numpy.random.shuffle", "sklearn.decomposition.PCA", "pandas.DataFrame"], "libs": ["numpy", "pandas", "sklearn"], "doc": {"description": ["Shuffles the columns of a numpy array randomly, performs Principal Component Analysis (PCA)", "to reduce the dimensionality to 2 principal components, and returns these components as a pandas DataFrame."], "note": ["PCA reduction will default to the number of features if fewer than 2.", "An named but empty DataFrame is returned for arrays without features or with empty content."], "params": ["array (numpy.ndarray): A 2D numpy array where each row is an observation and each column is a feature.", "seed (int, optional): Seed for the random number generator. 
Defaults to None (not set)."], "returns": ["pandas.DataFrame: DataFrame with columns 'PC1' and 'PC2' representing the two principal components."], "reqs": ["numpy", "pandas", "sklearn"], "raises": ["ValueError: If the input array is not 2D."], "example": ["Examples:", ">>> array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])", ">>> df = f_821(array, seed=42)", ">>> df", "PC1 PC2", "0 5.59017 4.440892e-16", "1 -5.59017 4.440892e-16", ">>> df.shape", "(2, 2)"]}} -{"task_id": "f_393", "prompt": "from datetime import datetime, timedelta\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_393(days_in_past=7, random_seed=0):\n \"\"\"\n Draw a graph of temperature trends over the past week using randomly generated data.\n\n This function generates random integer temperatures in Celcius with a low of 15 and high of 35.\n To show temperature trend, it plots date on the x-axis and temperature on the y-axis.\n\n Parameters:\n days_in_past (int, optional): The number of days in the past for which to generate the graph.\n Defaults to 7 days.\n random_seed (int, optional): Seed for random number generation. 
Defaults to 0.\n\n Returns:\n ax (matplotlib.axes._axes.Axes): Generated plot showing 'Temperature Trends Over the Past Week',\n with 'Date' on the a-xis and 'Temperature (\u00b0C)' on the y-axis.\n\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_393(random_seed=42)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(19810.0, 0, '2024-03-28'), Text(19811.0, 0, '2024-03-29'), Text(19812.0, 0, '2024-03-30'), Text(19813.0, 0, '2024-03-31'), Text(19814.0, 0, '2024-04-01'), Text(19815.0, 0, '2024-04-02'), Text(19816.0, 0, '2024-04-03')]\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n\n if days_in_past < 1:\n raise ValueError(\"days_in_past must be in the past\")\n\n dates = [datetime.now().date() - timedelta(days=i) for i in range(days_in_past)]\n temperatures = np.random.randint(low=15, high=35, size=days_in_past)\n\n fig, ax = plt.subplots()\n ax.plot(dates, temperatures)\n ax.set_xlabel(\"Date\")\n ax.set_ylabel(\"Temperature (\u00b0C)\")\n ax.set_title(\"Temperature Trend\")\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def _test_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel(), \"Temperature (\u00b0C)\")\n self.assertEqual(ax.get_title(), \"Temperature Trend\")\n def test_case_1(self):\n # Test default parameters\n ax = f_393()\n self._test_plot(ax)\n def test_case_2(self):\n # Test days in the past\n for n_days in [1, 5, 50, 100]:\n ax = f_393(n_days, random_seed=2)\n self._test_plot(ax)\n self.assertEqual(len(ax.lines[0].get_ydata()), n_days)\n def test_case_3(self):\n # Test handling invalid days in the past\n with self.assertRaises(Exception):\n f_393(0, random_seed=4)\n def test_case_4(self):\n # Test handling invalid days in the past\n with self.assertRaises(Exception):\n f_393(-1, random_seed=4)\n def 
test_case_5(self):\n # Test random seed reproducibility\n ax1 = f_393(5, random_seed=42)\n ax2 = f_393(5, random_seed=42)\n self.assertTrue(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata())\n )\n def test_case_6(self):\n # Test random seed difference\n ax1 = f_393(5, random_seed=0)\n ax2 = f_393(5, random_seed=42)\n self.assertFalse(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata())\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random.randint", "numpy.random", "numpy.random.seed", "datetime.datetime.now", "datetime.timedelta", "matplotlib.pyplot.subplots"], "libs": ["numpy", "matplotlib", "datetime"], "doc": {"description": ["Draw a graph of temperature trends over the past week using randomly generated data.", "This function generates random integer temperatures in Celcius with a low of 15 and high of 35.", "To show temperature trend, it plots date on the x-axis and temperature on the y-axis."], "note": [], "params": ["days_in_past (int, optional): The number of days in the past for which to generate the graph.", "Defaults to 7 days.", "random_seed (int, optional): Seed for random number generation. 
Defaults to 0."], "returns": ["ax (matplotlib.axes._axes.Axes): Generated plot showing 'Temperature Trends Over the Past Week',", "with 'Date' on the a-xis and 'Temperature (\u00b0C)' on the y-axis."], "reqs": ["datetime.datetime", "datetime.timedelta", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_393(random_seed=42)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(19810.0, 0, '2024-03-28'), Text(19811.0, 0, '2024-03-29'), Text(19812.0, 0, '2024-03-30'), Text(19813.0, 0, '2024-03-31'), Text(19814.0, 0, '2024-04-01'), Text(19815.0, 0, '2024-04-02'), Text(19816.0, 0, '2024-04-03')]"]}} -{"task_id": "f_878", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import confusion_matrix\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_878(feature: pd.Series, target: pd.Series) -> (np.ndarray, plt.Axes):\n \"\"\"\n Train a logistic regression model on one feature and evaluate its performance using a confusion matrix plot.\n The function takes a feature and a target series, splits them into training and testing sets, trains the logistic\n regression model, predicts the target for the test set, and plots the confusion matrix.\n\n Parameters:\n feature (pd.Series): Series representing the single feature for the logistic regression model.\n target (pd.Series): Series representing the target variable.\n\n Returns:\n (np.ndarray, plt.Axes): A tuple containing the confusion matrix and the matplotlib Axes object of the confusion matrix plot.\n\n Requirements:\n - pandas\n - sklearn.model_selection.train_test_split\n - sklearn.linear_model.LogisticRegression\n - sklearn.metrics.confusion_matrix\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> feature = pd.Series(np.random.rand(1000)) # Feature data\n >>> target = pd.Series(np.random.randint(0, 2, size=1000)) # Target data (binary)\n >>> cm, ax = f_878(feature, target)\n >>> 
ax.get_title()\n 'Confusion Matrix'\n \"\"\"", "canonical_solution": " # Create DataFrame from the series\n df = pd.DataFrame({\"Feature\": feature, \"Target\": target})\n\n # Split the data into train and test datasets\n X_train, X_test, y_train, y_test = train_test_split(\n df[\"Feature\"], df[\"Target\"], test_size=0.2, random_state=42\n )\n\n # Initialize and train the Logistic Regression model\n model = LogisticRegression()\n model.fit(X_train.values.reshape(-1, 1), y_train)\n\n # Make predictions\n y_pred = model.predict(X_test.values.reshape(-1, 1))\n\n # Compute the confusion matrix\n cm = confusion_matrix(y_test, y_pred)\n\n # Plot the confusion matrix\n _, ax = plt.subplots()\n cax = ax.matshow(cm, cmap=\"Blues\")\n plt.title(\"Confusion Matrix\")\n plt.xlabel(\"Predicted\")\n plt.ylabel(\"Actual\")\n plt.colorbar(cax)\n\n # Setting tick locations\n ax.set_xticks([0, 1])\n ax.set_yticks([0, 1])\n\n # Now set tick labels correctly\n ax.set_xticklabels([\"No\", \"Yes\"])\n ax.set_yticklabels([\"No\", \"Yes\"])\n\n return cm, ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_878.\"\"\"\n def test_with_random_data(self):\n \"\"\"\n Test the function with random data to ensure normal functionality.\n \"\"\"\n np.random.seed(42)\n feature = pd.Series(np.random.rand(100))\n np.random.seed(42)\n target = pd.Series(np.random.randint(0, 2, size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_all_zeroes(self):\n \"\"\"\n Test the function with all zeroes in the feature set.\n \"\"\"\n feature = pd.Series(np.zeros(100))\n np.random.seed(123)\n target = pd.Series(np.random.randint(0, 2, size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_all_ones(self):\n \"\"\"\n Test the 
function with all ones in the feature set.\n \"\"\"\n feature = pd.Series(np.ones(100))\n np.random.seed(42)\n target = pd.Series(np.random.randint(0, 2, size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_perfect_correlation(self):\n \"\"\"\n Test the function when the feature perfectly predicts the target.\n \"\"\"\n np.random.seed(123)\n feature = pd.Series(np.random.rand(100))\n target = feature.round()\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_no_correlation(self):\n \"\"\"\n Test the function when there is no correlation between feature and target.\n \"\"\"\n np.random.seed(42)\n feature = pd.Series(np.random.rand(100))\n np.random.seed(42)\n target = pd.Series(np.random.choice([0, 1], size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def tearDown(self):\n plt.close()", "apis": ["sklearn.model_selection.train_test_split", "numpy.ndarray", "matplotlib.pyplot.colorbar", "matplotlib.pyplot.xlabel", "matplotlib.pyplot.Axes", "pandas.Series", "sklearn.metrics.confusion_matrix", "matplotlib.pyplot.title", "matplotlib.pyplot.ylabel", "pandas.DataFrame", "matplotlib.pyplot.subplots", "sklearn.linear_model.LogisticRegression"], "libs": ["numpy", "matplotlib", "pandas", "sklearn"], "doc": {"description": ["Train a logistic regression model on one feature and evaluate its performance using a confusion matrix plot.", "The function takes a feature and a target series, splits them into training and testing sets, trains the logistic", "regression model, predicts the target for the test set, and plots the confusion matrix."], "note": [], "params": ["feature (pd.Series): Series representing the single feature for the logistic regression model.", "target (pd.Series): Series representing the target variable."], "returns": ["(np.ndarray, 
plt.Axes): A tuple containing the confusion matrix and the matplotlib Axes object of the confusion matrix plot."], "reqs": ["pandas", "sklearn.model_selection.train_test_split", "sklearn.linear_model.LogisticRegression", "sklearn.metrics.confusion_matrix", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> feature = pd.Series(np.random.rand(1000)) # Feature data", ">>> target = pd.Series(np.random.randint(0, 2, size=1000)) # Target data (binary)", ">>> cm, ax = f_878(feature, target)", ">>> ax.get_title()", "'Confusion Matrix'"]}} +{"task_id": "f_361", "prompt": "import subprocess\nimport os\nimport time\nfrom datetime import datetime\n\n\ndef f_361(script_dir, scripts, delay):\n \"\"\"\n Execute a list of bash scripts with a specified delay between each script.\n\n Parameters:\n script_dir (str): Path to the directory containing the scripts.\n scripts (list): List of script filenames to be executed. Must not be empty.\n If a script is not found, the function raises a FileNotFoundError.\n delay (int): The delay in seconds between each script execution. 
Must at least 0.\n\n Returns:\n list: A list of timestamps indicating the start time of each script execution.\n\n Requirements:\n - subprocess\n - os\n - time\n - datetime.datetime\n\n Example:\n >>> f_361('/path/to/scripts/', ['script1.sh', 'script2.sh'], 5)\n ['2023-09-09 10:10:10', '2023-09-09 10:10:15']\n \"\"\"", "canonical_solution": " if delay < 0:\n raise ValueError(\"delay cannot be negative.\")\n if not scripts:\n raise ValueError(\"No scripts provided.\")\n start_times = []\n for script in scripts:\n script_path = os.path.join(script_dir, script)\n start_time = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n start_times.append(start_time)\n\n result = subprocess.call(script_path, shell=True)\n if result != 0:\n raise FileNotFoundError(f\"Script not found: {script_path}\")\n\n time.sleep(delay)\n return start_times", "test": "import unittest\nimport tempfile\nimport os\nfrom datetime import datetime\nclass TestCases(unittest.TestCase):\n def setUp(self):\n # Create a temporary directory to store scripts\n self.temp_dir = tempfile.TemporaryDirectory()\n self.script_dir = self.temp_dir.name\n def tearDown(self):\n # Clean up the temporary directory\n self.temp_dir.cleanup()\n def create_temp_script(self, script_content):\n # Helper function to create a temporary script file with the given content\n fd, path = tempfile.mkstemp(dir=self.script_dir, suffix=\".sh\")\n with os.fdopen(fd, \"w\") as f:\n f.write(\"#!/bin/bash\\n\")\n f.write(script_content)\n os.chmod(path, 0o755)\n return os.path.basename(path)\n def test_case_1(self):\n # Testing with a single script and delay of 1 second\n script_name = self.create_temp_script(\"echo 'Test'\")\n scripts = [script_name]\n delay = 1\n start_times = f_361(self.script_dir, scripts, delay)\n self.assertEqual(len(start_times), 1)\n self.assertTrue(\n isinstance(datetime.strptime(start_times[0], \"%Y-%m-%d %H:%M:%S\"), datetime)\n )\n def test_case_2(self):\n # Testing with multiple scripts and a longer delay\n 
script_names = [\n self.create_temp_script(\"echo 'Test'\"),\n self.create_temp_script(\"echo 'Test 2'\"),\n ]\n delay = 2\n start_times = f_361(self.script_dir, script_names, delay)\n self.assertEqual(len(start_times), 2)\n time_diff = datetime.strptime(\n start_times[1], \"%Y-%m-%d %H:%M:%S\"\n ) - datetime.strptime(start_times[0], \"%Y-%m-%d %H:%M:%S\")\n self.assertEqual(time_diff.seconds, delay)\n def test_case_3(self):\n # Testing with an invalid script path\n with self.assertRaises(FileNotFoundError):\n f_361(self.script_dir, [\"this-doesn't-exist\"], 1)\n def test_case_4(self):\n # Testing with no scripts (empty list)\n with self.assertRaises(Exception):\n f_361(self.script_dir, [], 1)\n def test_case_5(self):\n # Testing with zero delay\n script_names = [\n self.create_temp_script(\"echo 'Test'\"),\n self.create_temp_script(\"echo 'Test 2'\"),\n ]\n delay = 0\n start_times = f_361(self.script_dir, script_names, delay)\n self.assertEqual(len(start_times), 2)\n def test_case_6(self):\n # Test handling invalid delay\n script_names = [\n self.create_temp_script(\"echo 'Test'\"),\n self.create_temp_script(\"echo 'Test 2'\"),\n ]\n with self.assertRaises(Exception):\n f_361(self.script_dir, script_names, -1)", "apis": ["datetime.datetime.now", "os.path", "subprocess.call", "time.sleep", "os.path.join"], "libs": ["subprocess", "os", "time", "datetime"], "doc": {"description": ["Execute a list of bash scripts with a specified delay between each script."], "note": [], "params": ["script_dir (str): Path to the directory containing the scripts.", "scripts (list): List of script filenames to be executed. Must not be empty.", "If a script is not found, the function raises a FileNotFoundError.", "delay (int): The delay in seconds between each script execution. 
Must at least 0."], "returns": ["list: A list of timestamps indicating the start time of each script execution."], "reqs": ["subprocess", "os", "time", "datetime.datetime"], "raises": [], "example": [">>> f_361('/path/to/scripts/', ['script1.sh', 'script2.sh'], 5)", "['2023-09-09 10:10:10', '2023-09-09 10:10:15']"]}} +{"task_id": "f_754", "prompt": "from collections import Counter\nimport itertools\n\ndef f_754(letters: list, repetitions: int) -> dict:\n \"\"\"\n Count the frequency of each letter in a list after repeating it a given number of times.\n\n Input:\n - letters (list): A list of single-character strings representing letters.\n - repetitions (int): The number of times to repeat the list.\n\n Output:\n Returns a dictionary where the keys are the letters and the values are their frequencies.\n\n Requirements:\n - collections.Counter\n - itertools\n\n Example:\n >>> f_754(['A', 'B', 'C'], 2)\n {'A': 2, 'B': 2, 'C': 2}\n >>> f_754(['A', 'B'], 3)\n {'A': 3, 'B': 3}\n \"\"\"", "canonical_solution": " # Create a flattened list by repeating the original list\n flattened_list = list(itertools.chain(*[letters for _ in range(repetitions)]))\n \n # Count the occurrences of each letter in the flattened list\n counts = dict(Counter(flattened_list))\n \n return counts", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \n def test_case_1(self):\n result = f_754(['A', 'B', 'C'], 2)\n expected = {'A': 2, 'B': 2, 'C': 2}\n self.assertEqual(result, expected)\n \n def test_case_2(self):\n result = f_754(['A', 'B'], 3)\n expected = {'A': 3, 'B': 3}\n self.assertEqual(result, expected)\n \n def test_case_3(self):\n result = f_754([], 2)\n expected = {}\n self.assertEqual(result, expected)\n \n def test_case_4(self):\n result = f_754(['A', 'B', 'A'], 2)\n expected = {'A': 4, 'B': 2}\n self.assertEqual(result, expected)\n \n def test_case_5(self):\n result = f_754(['A'], 0)\n expected = {}\n self.assertEqual(result, expected)", "apis": ["collections.Counter", 
"itertools.chain"], "libs": ["itertools", "collections"], "doc": {"description": ["Count the frequency of each letter in a list after repeating it a given number of times.", "Input:", "- letters (list): A list of single-character strings representing letters.", "- repetitions (int): The number of times to repeat the list.", "Output:", "Returns a dictionary where the keys are the letters and the values are their frequencies."], "note": [], "params": [], "returns": [], "reqs": ["collections.Counter", "itertools"], "raises": [], "example": [">>> f_754(['A', 'B', 'C'], 2)", "{'A': 2, 'B': 2, 'C': 2}", ">>> f_754(['A', 'B'], 3)", "{'A': 3, 'B': 3}"]}} +{"task_id": "f_884", "prompt": "import socket\nimport select\nimport queue\nfrom datetime import datetime, timedelta\n\n\ndef f_884(\n server_address=\"localhost\", server_port=12345, buffer_size=1024, run_duration=5\n):\n \"\"\"\n Run a non-blocking echo server that appends the server's current time to received data and sends it back to the client, while handling exceptional conditions for each socket.\n\n Parameters:\n - server_address (str): The address for the server to listen on. Default is 'localhost'.\n - server_port (int): The port for the server to listen on. Default is 12345.\n - buffer_size (int): The buffer size for data reception. Default is 1024 bytes.\n - run_duration (int): The duration (in seconds) for which the server will run. Default is 5 seconds.\n\n Returns:\n - str: A status message indicating the server's operation and run duration.\n\n Requirements:\n - socket\n - select\n - queue\n - datetime\n\n Example:\n >>> print(f_884())\n 'Server started on localhost:12345. 
Ran for 5 seconds.'\n \"\"\"", "canonical_solution": " server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n server.setblocking(0)\n server.bind((server_address, server_port))\n server.listen(5)\n inputs = [server]\n outputs = []\n message_queues = {}\n end_time = datetime.now() + timedelta(seconds=run_duration)\n\n try:\n while datetime.now() < end_time:\n readable, writable, _ = select.select(inputs, outputs, inputs, 1)\n for s in readable:\n if s is server:\n connection, _ = s.accept()\n connection.setblocking(0)\n inputs.append(connection)\n message_queues[connection] = queue.Queue()\n else:\n data = s.recv(buffer_size)\n if data:\n message_queues[s].put(f\"{datetime.now()}: {data.decode()}\")\n if s not in outputs:\n outputs.append(s)\n else:\n if s in outputs:\n outputs.remove(s)\n inputs.remove(s)\n s.close()\n del message_queues[s]\n\n for s in writable:\n if s not in message_queues:\n continue # Skip if socket's queue has been removed\n\n try:\n next_msg = message_queues[s].get_nowait()\n except queue.Empty:\n outputs.remove(s)\n else:\n s.sendall(next_msg.encode(\"utf-8\"))\n\n finally:\n server.close()\n\n return f\"Server started on {server_address}:{server_port}. 
Ran for {run_duration} seconds.\"", "test": "import unittest\nimport socket\nimport time\nimport threading\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_884 function.\"\"\"\n def setUp(self):\n # Start the server in a separate thread\n self.server_thread = threading.Thread(\n target=f_884, args=(\"localhost\", 12345, 1024, 10)\n )\n self.server_thread.start()\n time.sleep(1)\n def tearDown(self):\n # Ensure the server thread is closed after each test\n self.server_thread.join()\n def test_queue_empty_condition(self):\n \"\"\"Test if the server correctly handles an empty queue condition.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n # Send a message and then close the socket immediately\n client.sendall(\"Hello\".encode())\n client.close()\n # The server should handle the empty queue condition without crashing\n # Wait briefly to allow server to process the situation\n time.sleep(1)\n # Since the server should continue running and not crash,\n # we can attempt a new connection to check server's state\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as new_client:\n new_client.connect((\"localhost\", 12345))\n test_message = \"Test after empty queue\"\n new_client.sendall(test_message.encode())\n response = new_client.recv(1024).decode()\n self.assertIn(test_message, response)\n def test_server_response(self):\n \"\"\"Test if server correctly echoes received data with server time.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n test_message = \"Hello, Server!\"\n client.sendall(test_message.encode())\n response = client.recv(1024).decode()\n self.assertIn(test_message, response)\n def test_multiple_connections(self):\n \"\"\"Test the server's ability to handle multiple client connections.\"\"\"\n responses = []\n for _ in range(5):\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n 
client.connect((\"localhost\", 12345))\n client.sendall(\"Test\".encode())\n responses.append(client.recv(1024).decode())\n for response in responses:\n # Assuming the server response format includes the timestamp followed by the echoed message\n self.assertTrue(\"Test\" in response)\n def test_no_data_received(self):\n \"\"\"Test server behavior when no data is received from the client.\"\"\"\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n # Not sending any data\n client.settimeout(2)\n with self.assertRaises(socket.timeout):\n client.recv(1024)\n def test_server_closes_after_duration(self):\n \"\"\"Test if the server closes after the specified duration.\"\"\"\n # Wait for a duration longer than the server's run time\n time.sleep(5)\n with self.assertRaises((socket.timeout, ConnectionRefusedError)):\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.settimeout(2)\n client.connect((\"localhost\", 12345))\n client.recv(1024)\n def test_large_data_transfer(self):\n \"\"\"Test the server's ability to handle a large data transfer.\"\"\"\n large_data = \"A\" * 1000\n with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:\n client.connect((\"localhost\", 12345))\n client.sendall(large_data.encode())\n # Initialize an empty string to accumulate the response\n total_response = \"\"\n while True:\n # Receive data in chunks\n part = client.recv(1024).decode()\n total_response += part\n # Check if the end of the message is reached\n if large_data in total_response:\n break\n # Assert that the large data string is in the response\n self.assertIn(large_data, total_response)", "apis": ["queue.Queue", "socket.SOCK_STREAM", "socket.AF_INET", "socket.socket", "queue.Empty", "datetime.datetime.now", "select.select", "datetime.timedelta"], "libs": ["queue", "select", "datetime", "socket"], "doc": {"description": ["Run a non-blocking echo server that appends the server's current time to 
received data and sends it back to the client, while handling exceptional conditions for each socket."], "note": [], "params": ["server_address (str): The address for the server to listen on. Default is 'localhost'.", "server_port (int): The port for the server to listen on. Default is 12345.", "buffer_size (int): The buffer size for data reception. Default is 1024 bytes.", "run_duration (int): The duration (in seconds) for which the server will run. Default is 5 seconds."], "returns": ["str: A status message indicating the server's operation and run duration."], "reqs": ["socket", "select", "queue", "datetime"], "raises": [], "example": [">>> print(f_884())", "'Server started on localhost:12345. Ran for 5 seconds.'"]}} +{"task_id": "f_833", "prompt": "import binascii\nimport base64\nimport urllib.parse\nimport codecs\n\n\ndef f_833(hex_string):\n \"\"\"\n Convert a hexadecimal string to various encodings.\n\n This function takes a hexadecimal string as input and performs several encoding operations. \n Initially, it decodes the hexadecimal string to bytes and then converts these bytes into a UTF-8 string. \n This UTF-8 string is subsequently encoded into different formats: hexadecimal, base64, UTF-8, UTF-16, \n UTF-32, ASCII (if possible), URL encoding, and ROT13. Note that if ASCII not possible, returns 'Not representable in ASCII'.\n\n Parameters:\n - hex_string (str): The input string in hexadecimal format.\n\n Returns:\n - dict: A dictionary containing the input string encoded in various formats. The dictionary's keys\n are the encoding types ('hex', 'base64', 'utf-8', 'utf-16', 'utf-32', 'ASCII', 'URL', 'ROT13'),\n and the values are the corresponding encoded strings. 
If the string cannot be represented in ASCII,\n the 'ASCII' key maps to 'Not representable in ASCII'.\n\n Requirements:\n - binascii\n - base64\n - urllib\n - codecs\n\n Example:\n >>> f_833(\"4a4b4c\")\n {'hex': '4a4b4c', 'base64': 'SktM', 'utf-8': 'JKL', 'utf-16': 'JKL', 'utf-32': 'JKL', 'ASCII': 'JKL', 'URL': 'JKL', 'ROT13': 'WXY'}\n\n >>> f_833(\"68656c6c6f\")\n {'hex': '68656c6c6f', 'base64': 'aGVsbG8=', 'utf-8': 'hello', 'utf-16': 'hello', 'utf-32': 'hello', 'ASCII': 'hello', 'URL': 'hello', 'ROT13': 'uryyb'}\n \"\"\"", "canonical_solution": " encodings = {}\n\n # Convert hex string to its string representation\n decoded_str = bytes.fromhex(hex_string).decode(\"utf-8\")\n\n # Hexadecimal encoding\n encodings[\"hex\"] = binascii.hexlify(decoded_str.encode()).decode()\n\n # Base64 encoding\n encodings[\"base64\"] = base64.b64encode(decoded_str.encode()).decode()\n\n # UTF-8 encoding\n encodings[\"utf-8\"] = decoded_str.encode(\"utf-8\").decode()\n\n # UTF-16 encoding\n encodings[\"utf-16\"] = decoded_str.encode(\"utf-16\").decode(\"utf-16\")\n\n # UTF-32 encoding\n encodings[\"utf-32\"] = decoded_str.encode(\"utf-32\").decode(\"utf-32\")\n\n # ASCII encoding - only if characters are in ASCII range\n try:\n encodings[\"ASCII\"] = decoded_str.encode(\"ascii\").decode()\n except UnicodeEncodeError:\n encodings[\"ASCII\"] = \"Not representable in ASCII\"\n\n # URL encoding\n encodings[\"URL\"] = urllib.parse.quote(decoded_str)\n\n # ROT13 encoding\n encodings[\"ROT13\"] = codecs.encode(decoded_str, \"rot_13\")\n\n return encodings", "test": "import unittest\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for f_833\"\"\"\n def test_hex_string_sample(self):\n \"\"\"Test the sample input from the problem description.\"\"\"\n hex_str = \"4a4b4c\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"SktM\")\n self.assertEqual(result[\"utf-8\"], \"JKL\")\n self.assertEqual(result[\"utf-16\"], \"JKL\")\n 
self.assertEqual(result[\"utf-32\"], \"JKL\")\n self.assertEqual(result[\"ASCII\"], \"JKL\")\n self.assertEqual(result[\"URL\"], \"JKL\")\n self.assertEqual(result[\"ROT13\"], \"WXY\")\n def test_hex_string_1(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"68656c6c6f\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"aGVsbG8=\")\n self.assertEqual(result[\"utf-8\"], \"hello\")\n self.assertEqual(result[\"utf-16\"], \"hello\")\n self.assertEqual(result[\"utf-32\"], \"hello\")\n self.assertEqual(result[\"ASCII\"], \"hello\")\n self.assertEqual(result[\"URL\"], \"hello\")\n self.assertEqual(result[\"ROT13\"], \"uryyb\")\n def test_hex_string_2(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"776f726c64\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"d29ybGQ=\")\n self.assertEqual(result[\"utf-8\"], \"world\")\n self.assertEqual(result[\"utf-16\"], \"world\")\n self.assertEqual(result[\"utf-32\"], \"world\")\n self.assertEqual(result[\"ASCII\"], \"world\")\n self.assertEqual(result[\"URL\"], \"world\")\n self.assertEqual(result[\"ROT13\"], \"jbeyq\")\n def test_hex_string_3(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"616263\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"YWJj\")\n self.assertEqual(result[\"utf-8\"], \"abc\")\n self.assertEqual(result[\"utf-16\"], \"abc\")\n self.assertEqual(result[\"utf-32\"], \"abc\")\n self.assertEqual(result[\"ASCII\"], \"abc\")\n self.assertEqual(result[\"URL\"], \"abc\")\n self.assertEqual(result[\"ROT13\"], \"nop\")\n def test_hex_string_4(self):\n \"\"\"Test a hex string with a mix of letters and numbers.\"\"\"\n hex_str = \"313233\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n 
self.assertEqual(result[\"base64\"], \"MTIz\")\n self.assertEqual(result[\"utf-8\"], \"123\")\n self.assertEqual(result[\"utf-16\"], \"123\")\n self.assertEqual(result[\"utf-32\"], \"123\")\n self.assertEqual(result[\"ASCII\"], \"123\")\n self.assertEqual(result[\"URL\"], \"123\")\n self.assertEqual(result[\"ROT13\"], \"123\")\n def test_hex_string_non_ascii(self):\n \"\"\"Test a hex string with non-ASCII characters.\"\"\"\n hex_str = \"c3a9\"\n result = f_833(hex_str)\n self.assertEqual(result[\"hex\"], hex_str)\n self.assertEqual(result[\"base64\"], \"w6k=\")\n self.assertEqual(result[\"utf-8\"], \"\u00e9\")\n self.assertEqual(result[\"utf-16\"], \"\u00e9\")\n self.assertEqual(result[\"utf-32\"], \"\u00e9\")\n self.assertEqual(result[\"ASCII\"], \"Not representable in ASCII\")\n self.assertEqual(result[\"URL\"], \"%C3%A9\")\n self.assertEqual(result[\"ROT13\"], \"\u00e9\")", "apis": ["codecs.encode", "binascii.hexlify", "urllib.parse.quote", "urllib.parse", "base64.b64encode"], "libs": ["base64", "urllib", "binascii", "codecs"], "doc": {"description": ["Convert a hexadecimal string to various encodings.", "This function takes a hexadecimal string as input and performs several encoding operations.", "Initially, it decodes the hexadecimal string to bytes and then converts these bytes into a UTF-8 string.", "This UTF-8 string is subsequently encoded into different formats: hexadecimal, base64, UTF-8, UTF-16,", "UTF-32, ASCII (if possible), URL encoding, and ROT13. Note that if ASCII not possible, returns 'Not representable in ASCII'.", ">>> f_833(\"68656c6c6f\")", "{'hex': '68656c6c6f', 'base64': 'aGVsbG8=', 'utf-8': 'hello', 'utf-16': 'hello', 'utf-32': 'hello', 'ASCII': 'hello', 'URL': 'hello', 'ROT13': 'uryyb'}"], "note": [], "params": ["hex_string (str): The input string in hexadecimal format."], "returns": ["dict: A dictionary containing the input string encoded in various formats. 
The dictionary's keys", "are the encoding types ('hex', 'base64', 'utf-8', 'utf-16', 'utf-32', 'ASCII', 'URL', 'ROT13'),", "and the values are the corresponding encoded strings. If the string cannot be represented in ASCII,", "the 'ASCII' key maps to 'Not representable in ASCII'."], "reqs": ["binascii", "base64", "urllib", "codecs"], "raises": [], "example": [">>> f_833(\"4a4b4c\")", "{'hex': '4a4b4c', 'base64': 'SktM', 'utf-8': 'JKL', 'utf-16': 'JKL', 'utf-32': 'JKL', 'ASCII': 'JKL', 'URL': 'JKL', 'ROT13': 'WXY'}"]}} +{"task_id": "f_415", "prompt": "import json\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nfrom collections import defaultdict\n\n\ndef f_415(input_file: str) -> plt.Axes:\n \"\"\"\n Read a list of dictionaries from a JSON file, calculate the results (mean and median for each key)\n via numpy, convert the input data into a pandas DataFrame with the keys as \"X\" and values as \"Y\"\n for visualization with a seaborn box plot, then return the results and box plot.\n\n Parameters:\n - input_file (str): The input JSON file name with absolute path.\n\n Returns:\n - results (dict): Dictionary where each key is a unique key from the original input, and each\n value is a corresponding dict, with keys 'mean' and 'median' and the statistics\n as values.\n - ax (plt.Axes): The box plot of aggregated 'Values for Each Key' in the input data.\n\n Requirements:\n - json\n - seaborn\n - matplotlib.pyplot\n - pandas\n - numpy\n - collections.defaultdict\n\n Example:\n >>> results, ax = f_415(\"/path/to/data.json\")\n >>> ax\n \n >>> results\n {'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 2.0, 'median': 3.0}}\n \"\"\"", "canonical_solution": " with open(input_file, \"r\") as f:\n data = json.load(f)\n\n stats = defaultdict(list)\n for d in data:\n for key, value in d.items():\n stats[key].append(value)\n\n results = {\n k: {\"mean\": np.mean(v), \"median\": np.median(v)} for k, v in stats.items()\n }\n\n data = 
pd.DataFrame(data).melt(var_name=\"X\", value_name=\"Y\")\n ax = sns.boxplot(data=data, x=\"X\", y=\"Y\")\n ax.set_title(\"Boxplot of Values for Each Key\")\n return results, ax", "test": "import unittest\nimport os\nimport tempfile\nimport matplotlib.pyplot as plt\nimport json\nclass TestCases(unittest.TestCase):\n @classmethod\n def setUpClass(cls):\n # Setup a temporary directory and write sample JSON data to a temp file\n cls.temp_dir = tempfile.TemporaryDirectory()\n cls.sample_data_file = os.path.join(cls.temp_dir.name, \"sample_data.json\")\n cls.sample_data = [\n {\"A\": 10, \"B\": 20, \"C\": 30},\n {\"A\": 15, \"B\": 25, \"C\": 35},\n {\"A\": 20, \"B\": 30, \"C\": 40},\n ]\n with open(cls.sample_data_file, \"w\") as f:\n json.dump(cls.sample_data, f)\n # Create an invalid JSON file for testing\n cls.invalid_json_file = os.path.join(cls.temp_dir.name, \"invalid.json\")\n with open(cls.invalid_json_file, \"w\") as f:\n f.write(\"invalid content\")\n @classmethod\n def tearDownClass(cls):\n cls.temp_dir.cleanup()\n plt.close(\"all\")\n def test_case_1(self):\n # Test if the function can read the JSON data file and return a plot\n _, ax = f_415(self.sample_data_file)\n self.assertIsInstance(ax, plt.Axes, \"The function should return a plot (Axes).\")\n self.assertTrue(len(ax.get_xticks()) > 0, \"The plot should have x-axis ticks.\")\n self.assertTrue(len(ax.get_yticks()) > 0, \"The plot should have y-axis ticks.\")\n self.assertTrue(ax.get_title(), \"Boxplot of Values for Each Key\")\n def test_case_2(self):\n # Check result correctness\n results, _ = f_415(self.sample_data_file)\n self.assertIn(\"A\", results)\n self.assertIn(\"B\", results)\n self.assertIn(\"C\", results)\n self.assertEqual(results[\"A\"][\"mean\"], 15.0)\n self.assertEqual(results[\"A\"][\"median\"], 15.0)\n self.assertEqual(results[\"B\"][\"mean\"], 25.0)\n self.assertEqual(results[\"B\"][\"median\"], 25.0)\n self.assertEqual(results[\"C\"][\"mean\"], 35.0)\n 
self.assertEqual(results[\"C\"][\"median\"], 35.0)\n def test_case_3(self):\n # Test the correctness of the x-axis labels\n _, ax = f_415(self.sample_data_file)\n x_labels = [label.get_text() for label in ax.get_xticklabels()]\n expected_x_labels = [\"A\", \"B\", \"C\"]\n self.assertListEqual(\n x_labels, expected_x_labels, \"The x-axis labels are not as expected.\"\n )\n def test_case_4(self):\n # Test the correctness of the y-axis data points\n _, ax = f_415(self.sample_data_file)\n # Correctly extract the height of the boxes in the box plot\n boxes = [\n box.get_height() for box in ax.containers if hasattr(box, \"get_height\")\n ]\n self.assertTrue(\n all(height > 0 for height in boxes),\n \"Each box plot should have y-data points.\",\n )\n def test_case_5(self):\n # Test if the function raises an error for non-existent file\n with self.assertRaises(FileNotFoundError):\n f_415(os.path.join(self.temp_dir.name, \"non_existent.json\"))\n def test_case_6(self):\n # Test if the function raises an error for invalid JSON format\n with self.assertRaises(json.JSONDecodeError):\n f_415(os.path.join(self.temp_dir.name, \"invalid.json\"))", "apis": ["seaborn.boxplot", "collections.defaultdict", "pandas.DataFrame", "numpy.mean", "numpy.median", "json.load", "matplotlib.pyplot.Axes"], "libs": ["collections", "numpy", "seaborn", "pandas", "json", "matplotlib"], "doc": {"description": ["Read a list of dictionaries from a JSON file, calculate the results (mean and median for each key)", "via numpy, convert the input data into a pandas DataFrame with the keys as \"X\" and values as \"Y\"", "for visualization with a seaborn box plot, then return the results and box plot."], "note": [], "params": ["input_file (str): The input JSON file name with absolute path."], "returns": ["results (dict): Dictionary where each key is a unique key from the original input, and each", "value is a corresponding dict, with keys 'mean' and 'median' and the statistics", "as values.", "ax (plt.Axes): 
The box plot of aggregated 'Values for Each Key' in the input data."], "reqs": ["json", "seaborn", "matplotlib.pyplot", "pandas", "numpy", "collections.defaultdict"], "raises": [], "example": [">>> results, ax = f_415(\"/path/to/data.json\")", ">>> ax", "", ">>> results", "{'a': {'mean': 3.0, 'median': 3.0}, 'b': {'mean': 2.0, 'median': 3.0}}"]}} +{"task_id": "f_821", "prompt": "import numpy as np\nimport pandas as pd\nfrom sklearn.decomposition import PCA\n\n\ndef f_821(array, seed=None):\n \"\"\"\n Shuffles the columns of a numpy array randomly, performs Principal Component Analysis (PCA)\n to reduce the dimensionality to 2 principal components, and returns these components as a pandas DataFrame.\n\n Parameters:\n - array (numpy.ndarray): A 2D numpy array where each row is an observation and each column is a feature.\n - seed (int, optional): Seed for the random number generator. Defaults to None (not set).\n\n Returns:\n - pandas.DataFrame: DataFrame with columns 'PC1' and 'PC2' representing the two principal components.\n\n Raises:\n - ValueError: If the input array is not 2D.\n\n Requirements:\n - numpy\n - pandas\n - sklearn\n\n Note:\n - PCA reduction will default to the number of features if fewer than 2.\n - An named but empty DataFrame is returned for arrays without features or with empty content.\n\n Examples:\n >>> array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n >>> df = f_821(array, seed=42)\n >>> df\n PC1 PC2\n 0 5.59017 4.440892e-16\n 1 -5.59017 4.440892e-16\n >>> df.shape\n (2, 2)\n \"\"\"", "canonical_solution": " if seed is not None:\n np.random.seed(seed)\n\n if not isinstance(array, np.ndarray) or len(array.shape) != 2:\n raise ValueError(\"Input must be a 2D numpy array.\")\n\n if array.size == 0 or array.shape[1] == 0:\n return pd.DataFrame(columns=[\"PC1\", \"PC2\"])\n\n shuffled_array = np.copy(array)\n np.random.shuffle(np.transpose(shuffled_array))\n\n n_components = min(2, shuffled_array.shape[1])\n pca = 
PCA(n_components=n_components)\n principal_components = pca.fit_transform(shuffled_array)\n\n column_labels = [\"PC1\", \"PC2\"][:n_components]\n df = pd.DataFrame(data=principal_components, columns=column_labels)\n\n return df", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.array2x5 = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])\n self.array5x1 = np.array([[1], [2], [3], [4], [5]])\n def test_with_empty_array(self):\n \"\"\"Test handling of an empty array.\"\"\"\n array = np.empty((0, 0))\n df = f_821(array, seed=42)\n self.assertTrue(df.empty, \"The returned DataFrame should be empty.\")\n self.assertTrue(\n (df.columns == [\"PC1\", \"PC2\"]).all(),\n \"Column names should be 'PC1' and 'PC2' even for an empty DataFrame.\",\n )\n def test_with_2x5_array(self):\n \"\"\"Test PCA on a 2x5 array with shuffled columns.\"\"\"\n df = f_821(self.array2x5, seed=42)\n self.assertEqual(df.shape, (2, 2), \"DataFrame shape should be (2, 2).\")\n self.assertTrue(\n (df.columns == [\"PC1\", \"PC2\"]).all(),\n \"Column names should be 'PC1' and 'PC2'.\",\n )\n def test_with_5x1_array(self):\n \"\"\"Test PCA on a 5x1 array.\"\"\"\n df = f_821(self.array5x1, seed=0)\n self.assertEqual(\n df.shape, (5, 1), \"DataFrame shape should be (5, 1) for a single component.\"\n )\n self.assertTrue(\n (df.columns == [\"PC1\"]).all(),\n \"Column name should be 'PC1' for a single component.\",\n )\n def test_invalid_input(self):\n \"\"\"Test handling of invalid input.\"\"\"\n with self.assertRaises(ValueError):\n f_821(np.array([1, 2, 3]), seed=42)\n def test_reproducibility(self):\n \"\"\"Test if the function is reproducible with the same seed.\"\"\"\n df1 = f_821(self.array2x5, seed=42)\n df2 = f_821(self.array2x5, seed=42)\n pd.testing.assert_frame_equal(\n df1, df2, \"Results should be identical when using the same seed.\"\n )\n def test_pca_correctness(self):\n \"\"\"\n Test PCA correctness by ensuring that the variance is 
captured correctly\n in the principal components.\n \"\"\"\n # Creating a simple array where variance is higher in one dimension\n # This dataset is designed so that the first principal component should\n # capture the majority of the variance.\n array = np.array(\n [\n [1, 2, 3, 4, 5],\n [1, 2, 3, 4, 5],\n [1, 2, 3, 4, 5],\n [1, 2, 3, 4, 5],\n [10, 10, 10, 10, 10],\n ]\n ) # Increased variance in the last row\n df = f_821(array, seed=0)\n # The PCA should be able to capture the variance in the first principal component\n # significantly more than in the second, if applicable.\n # Asserting that the first PC values are not all the same,\n # which indicates it captured the variance.\n self.assertFalse(\n df[\"PC1\"].std() == 0,\n \"PCA should capture variance along the first principal component.\",\n )", "apis": ["numpy.random.shuffle", "pandas.DataFrame", "numpy.random", "numpy.transpose", "numpy.ndarray", "numpy.copy", "numpy.random.seed", "sklearn.decomposition.PCA"], "libs": ["pandas", "numpy", "sklearn"], "doc": {"description": ["Shuffles the columns of a numpy array randomly, performs Principal Component Analysis (PCA)", "to reduce the dimensionality to 2 principal components, and returns these components as a pandas DataFrame."], "note": ["PCA reduction will default to the number of features if fewer than 2.", "An named but empty DataFrame is returned for arrays without features or with empty content."], "params": ["array (numpy.ndarray): A 2D numpy array where each row is an observation and each column is a feature.", "seed (int, optional): Seed for the random number generator. 
Defaults to None (not set)."], "returns": ["pandas.DataFrame: DataFrame with columns 'PC1' and 'PC2' representing the two principal components."], "reqs": ["numpy", "pandas", "sklearn"], "raises": ["ValueError: If the input array is not 2D."], "example": ["Examples:", ">>> array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])", ">>> df = f_821(array, seed=42)", ">>> df", "PC1 PC2", "0 5.59017 4.440892e-16", "1 -5.59017 4.440892e-16", ">>> df.shape", "(2, 2)"]}} +{"task_id": "f_393", "prompt": "from datetime import datetime, timedelta\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef f_393(days_in_past=7, random_seed=0):\n \"\"\"\n Draw a graph of temperature trends over the past week using randomly generated data.\n\n This function generates random integer temperatures in Celcius with a low of 15 and high of 35.\n To show temperature trend, it plots date on the x-axis and temperature on the y-axis.\n\n Parameters:\n days_in_past (int, optional): The number of days in the past for which to generate the graph.\n Defaults to 7 days.\n random_seed (int, optional): Seed for random number generation. 
Defaults to 0.\n\n Returns:\n ax (matplotlib.axes._axes.Axes): Generated plot showing 'Temperature Trends Over the Past Week',\n with 'Date' on the a-xis and 'Temperature (\u00b0C)' on the y-axis.\n\n\n Requirements:\n - datetime.datetime\n - datetime.timedelta\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> ax = f_393(random_seed=42)\n >>> type(ax)\n \n >>> ax.get_xticklabels()\n [Text(19810.0, 0, '2024-03-28'), Text(19811.0, 0, '2024-03-29'), Text(19812.0, 0, '2024-03-30'), Text(19813.0, 0, '2024-03-31'), Text(19814.0, 0, '2024-04-01'), Text(19815.0, 0, '2024-04-02'), Text(19816.0, 0, '2024-04-03')]\n \"\"\"", "canonical_solution": " np.random.seed(random_seed)\n\n if days_in_past < 1:\n raise ValueError(\"days_in_past must be in the past\")\n\n dates = [datetime.now().date() - timedelta(days=i) for i in range(days_in_past)]\n temperatures = np.random.randint(low=15, high=35, size=days_in_past)\n\n fig, ax = plt.subplots()\n ax.plot(dates, temperatures)\n ax.set_xlabel(\"Date\")\n ax.set_ylabel(\"Temperature (\u00b0C)\")\n ax.set_title(\"Temperature Trend\")\n return ax", "test": "import unittest\nimport matplotlib.pyplot as plt\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def _test_plot(self, ax):\n self.assertIsInstance(ax, plt.Axes)\n self.assertEqual(ax.get_xlabel(), \"Date\")\n self.assertEqual(ax.get_ylabel(), \"Temperature (\u00b0C)\")\n self.assertEqual(ax.get_title(), \"Temperature Trend\")\n def test_case_1(self):\n # Test default parameters\n ax = f_393()\n self._test_plot(ax)\n def test_case_2(self):\n # Test days in the past\n for n_days in [1, 5, 50, 100]:\n ax = f_393(n_days, random_seed=2)\n self._test_plot(ax)\n self.assertEqual(len(ax.lines[0].get_ydata()), n_days)\n def test_case_3(self):\n # Test handling invalid days in the past\n with self.assertRaises(Exception):\n f_393(0, random_seed=4)\n def test_case_4(self):\n # Test handling invalid days in the past\n with self.assertRaises(Exception):\n f_393(-1, random_seed=4)\n def 
test_case_5(self):\n # Test random seed reproducibility\n ax1 = f_393(5, random_seed=42)\n ax2 = f_393(5, random_seed=42)\n self.assertTrue(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata())\n )\n def test_case_6(self):\n # Test random seed difference\n ax1 = f_393(5, random_seed=0)\n ax2 = f_393(5, random_seed=42)\n self.assertFalse(\n np.array_equal(ax1.lines[0].get_ydata(), ax2.lines[0].get_ydata())\n )\n def tearDown(self):\n plt.close(\"all\")", "apis": ["numpy.random.randint", "numpy.random", "datetime.datetime.now", "matplotlib.pyplot.subplots", "datetime.timedelta", "numpy.random.seed"], "libs": ["numpy", "matplotlib", "datetime"], "doc": {"description": ["Draw a graph of temperature trends over the past week using randomly generated data.", "This function generates random integer temperatures in Celcius with a low of 15 and high of 35.", "To show temperature trend, it plots date on the x-axis and temperature on the y-axis."], "note": [], "params": ["days_in_past (int, optional): The number of days in the past for which to generate the graph.", "Defaults to 7 days.", "random_seed (int, optional): Seed for random number generation. 
Defaults to 0."], "returns": ["ax (matplotlib.axes._axes.Axes): Generated plot showing 'Temperature Trends Over the Past Week',", "with 'Date' on the a-xis and 'Temperature (\u00b0C)' on the y-axis."], "reqs": ["datetime.datetime", "datetime.timedelta", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> ax = f_393(random_seed=42)", ">>> type(ax)", "", ">>> ax.get_xticklabels()", "[Text(19810.0, 0, '2024-03-28'), Text(19811.0, 0, '2024-03-29'), Text(19812.0, 0, '2024-03-30'), Text(19813.0, 0, '2024-03-31'), Text(19814.0, 0, '2024-04-01'), Text(19815.0, 0, '2024-04-02'), Text(19816.0, 0, '2024-04-03')]"]}} +{"task_id": "f_878", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import confusion_matrix\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\ndef f_878(feature: pd.Series, target: pd.Series) -> (np.ndarray, plt.Axes):\n \"\"\"\n Train a logistic regression model on one feature and evaluate its performance using a confusion matrix plot.\n The function takes a feature and a target series, splits them into training and testing sets, trains the logistic\n regression model, predicts the target for the test set, and plots the confusion matrix.\n\n Parameters:\n feature (pd.Series): Series representing the single feature for the logistic regression model.\n target (pd.Series): Series representing the target variable.\n\n Returns:\n (np.ndarray, plt.Axes): A tuple containing the confusion matrix and the matplotlib Axes object of the confusion matrix plot.\n\n Requirements:\n - pandas\n - sklearn.model_selection.train_test_split\n - sklearn.linear_model.LogisticRegression\n - sklearn.metrics.confusion_matrix\n - numpy\n - matplotlib.pyplot\n\n Example:\n >>> feature = pd.Series(np.random.rand(1000)) # Feature data\n >>> target = pd.Series(np.random.randint(0, 2, size=1000)) # Target data (binary)\n >>> cm, ax = f_878(feature, target)\n >>> 
ax.get_title()\n 'Confusion Matrix'\n \"\"\"", "canonical_solution": " # Create DataFrame from the series\n df = pd.DataFrame({\"Feature\": feature, \"Target\": target})\n\n # Split the data into train and test datasets\n X_train, X_test, y_train, y_test = train_test_split(\n df[\"Feature\"], df[\"Target\"], test_size=0.2, random_state=42\n )\n\n # Initialize and train the Logistic Regression model\n model = LogisticRegression()\n model.fit(X_train.values.reshape(-1, 1), y_train)\n\n # Make predictions\n y_pred = model.predict(X_test.values.reshape(-1, 1))\n\n # Compute the confusion matrix\n cm = confusion_matrix(y_test, y_pred)\n\n # Plot the confusion matrix\n _, ax = plt.subplots()\n cax = ax.matshow(cm, cmap=\"Blues\")\n plt.title(\"Confusion Matrix\")\n plt.xlabel(\"Predicted\")\n plt.ylabel(\"Actual\")\n plt.colorbar(cax)\n\n # Setting tick locations\n ax.set_xticks([0, 1])\n ax.set_yticks([0, 1])\n\n # Now set tick labels correctly\n ax.set_xticklabels([\"No\", \"Yes\"])\n ax.set_yticklabels([\"No\", \"Yes\"])\n\n return cm, ax", "test": "import unittest\nimport pandas as pd\nimport numpy as np\nfrom matplotlib.axes import Axes\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function f_878.\"\"\"\n def test_with_random_data(self):\n \"\"\"\n Test the function with random data to ensure normal functionality.\n \"\"\"\n np.random.seed(42)\n feature = pd.Series(np.random.rand(100))\n np.random.seed(42)\n target = pd.Series(np.random.randint(0, 2, size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_all_zeroes(self):\n \"\"\"\n Test the function with all zeroes in the feature set.\n \"\"\"\n feature = pd.Series(np.zeros(100))\n np.random.seed(123)\n target = pd.Series(np.random.randint(0, 2, size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_all_ones(self):\n \"\"\"\n Test the 
function with all ones in the feature set.\n \"\"\"\n feature = pd.Series(np.ones(100))\n np.random.seed(42)\n target = pd.Series(np.random.randint(0, 2, size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_perfect_correlation(self):\n \"\"\"\n Test the function when the feature perfectly predicts the target.\n \"\"\"\n np.random.seed(123)\n feature = pd.Series(np.random.rand(100))\n target = feature.round()\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def test_with_no_correlation(self):\n \"\"\"\n Test the function when there is no correlation between feature and target.\n \"\"\"\n np.random.seed(42)\n feature = pd.Series(np.random.rand(100))\n np.random.seed(42)\n target = pd.Series(np.random.choice([0, 1], size=100))\n cm, ax = f_878(feature, target)\n self.assertIsInstance(cm, np.ndarray)\n self.assertIsInstance(ax, Axes)\n def tearDown(self):\n plt.close()", "apis": ["pandas.Series", "sklearn.linear_model.LogisticRegression", "pandas.DataFrame", "matplotlib.pyplot.colorbar", "sklearn.metrics.confusion_matrix", "numpy.ndarray", "matplotlib.pyplot.title", "matplotlib.pyplot.subplots", "sklearn.model_selection.train_test_split", "matplotlib.pyplot.ylabel", "matplotlib.pyplot.Axes", "matplotlib.pyplot.xlabel"], "libs": ["matplotlib", "numpy", "pandas", "sklearn"], "doc": {"description": ["Train a logistic regression model on one feature and evaluate its performance using a confusion matrix plot.", "The function takes a feature and a target series, splits them into training and testing sets, trains the logistic", "regression model, predicts the target for the test set, and plots the confusion matrix."], "note": [], "params": ["feature (pd.Series): Series representing the single feature for the logistic regression model.", "target (pd.Series): Series representing the target variable."], "returns": ["(np.ndarray, 
plt.Axes): A tuple containing the confusion matrix and the matplotlib Axes object of the confusion matrix plot."], "reqs": ["pandas", "sklearn.model_selection.train_test_split", "sklearn.linear_model.LogisticRegression", "sklearn.metrics.confusion_matrix", "numpy", "matplotlib.pyplot"], "raises": [], "example": [">>> feature = pd.Series(np.random.rand(1000)) # Feature data", ">>> target = pd.Series(np.random.randint(0, 2, size=1000)) # Target data (binary)", ">>> cm, ax = f_878(feature, target)", ">>> ax.get_title()", "'Confusion Matrix'"]}} {"task_id": "f_863", "prompt": "import json\nimport requests\nimport chardet\n\n# Constants\nAPI_URL = \"http://api.example.com/data\"\n\n\ndef f_863(url=API_URL, from_encoding=None, to_encoding=\"utf8\"):\n \"\"\"\n Fetches data from a specified REST API URL and processes it for JSON parsing. The process involves decoding\n and re-encoding the data, handling different encoding scenarios.\n\n Note:\n - The function initiates an HTTP GET request to the specified URL with a 5-second timeout. It retrieves the response\n content in raw bytes.\n\n\n Parameters:\n - url (str): The URL of the REST API. Default is 'http://api.example.com/data'.\n - from_encoding (str, optional): The original encoding of the data. If None, encoding is auto-detected. Default is None.\n - to_encoding (str): The target encoding format for the data. Default is 'utf8'.\n\n Returns:\n - dict: The JSON-parsed data after re-encoding. 
Returns an empty dictionary if the content is empty.\n\n Raises:\n - ValueError: \"Unable to detect encoding for non-empty content\", if it fails to detect the encoding for non-empty response content.\n\n Requirements:\n - json\n - requests\n - chardet\n\n Example:\n >>> data = f_863('http://api.example.com/data')\n >>> print(data)\n {'key': 'value'} # Example of expected output\n\n Notes:\n - The function sets a timeout of 5 seconds for the API request.\n - It handles cases with empty content and undetectable encoding by returning an empty dictionary or raising an exception, respectively.\n - The decoding and re-encoding steps ensure compatibility with various data formats and the final JSON parsing.\n \"\"\"", "canonical_solution": " response = requests.get(url, timeout=5)\n content = response.content\n\n if from_encoding is None:\n detected_encoding = chardet.detect(content)[\"encoding\"]\n # Handling the case where detected encoding is None\n if detected_encoding is None:\n if content:\n raise ValueError(\"Unable to detect encoding for non-empty content\")\n else:\n # Handle empty content gracefully\n return {}\n content = content.decode(detected_encoding)\n else:\n content = content.decode(from_encoding)\n\n content = content.encode(to_encoding).decode(to_encoding)\n\n data = json.loads(content)\n\n return data", "test": "import unittest\nimport json\nimport requests\nfrom unittest import mock\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the function.\"\"\"\n @mock.patch(\"requests.get\")\n @mock.patch(\"chardet.detect\")\n def test_get_data_with_default_parameters(self, mock_detect, mock_get):\n \"\"\"Test that the function works with default parameters and automatically detects encoding.\"\"\"\n response_content = '{\"key\": \"value\"}'.encode(\"cp1251\")\n mock_get.return_value.content = response_content\n mock_detect.return_value = {\"encoding\": \"cp1251\"}\n result = f_863()\n expected_output = {\"key\": \"value\"}\n 
self.assertEqual(result, expected_output)\n @mock.patch(\"requests.get\")\n def test_get_data_with_custom_url_and_encodings(self, mock_get):\n \"\"\"Test that the function can handle custom URL and specified encodings.\"\"\"\n response_content = '{\"message\": \"success\"}'.encode(\"latin1\")\n mock_get.return_value.content = response_content\n result = f_863(\n url=\"http://custom.url/api\", from_encoding=\"latin1\", to_encoding=\"utf8\"\n )\n expected_output = {\"message\": \"success\"}\n self.assertEqual(result, expected_output)\n @mock.patch(\"requests.get\")\n def test_get_data_with_empty_response(self, mock_get):\n \"\"\"Test that the function returns an empty dictionary when the response content is empty.\"\"\"\n mock_get.return_value.content = b\"\"\n result = f_863()\n expected_output = {}\n self.assertEqual(result, expected_output)\n @mock.patch(\"requests.get\")\n def test_get_data_with_invalid_json(self, mock_get):\n \"\"\"Test that the function raises an error when the response content is not valid JSON.\"\"\"\n response_content = b\"{invalid json content}\"\n mock_get.return_value.content = response_content\n with self.assertRaises(json.JSONDecodeError):\n f_863()\n @mock.patch(\"requests.get\")\n def test_get_data_with_different_valid_encoding(self, mock_get):\n \"\"\"Test that the function can handle different specified encodings.\"\"\"\n response_content = '{\"text\": \"\u3053\u3093\u306b\u3061\u306f\"}'.encode(\"utf8\")\n mock_get.return_value.content = response_content\n result = f_863(from_encoding=\"utf8\", to_encoding=\"utf8\")\n expected_output = {\"text\": \"\u3053\u3093\u306b\u3061\u306f\"}\n self.assertEqual(result, expected_output)\n @mock.patch(\"requests.get\")\n @mock.patch(\"chardet.detect\")\n def test_get_data_with_undetectable_encoding(self, mock_detect, mock_get):\n \"\"\"Test that the function raises ValueError when encoding cannot be detected for non-empty content.\"\"\"\n # Mocking response content as non-empty and undetectable 
encoding\n response_content = b\"Some non-empty content\"\n mock_get.return_value.content = response_content\n mock_detect.return_value = {\"encoding\": None}\n with self.assertRaises(ValueError) as context:\n f_863()\n # Asserting that the correct ValueError is raised\n self.assertTrue(\n \"Unable to detect encoding for non-empty content\" in str(context.exception)\n )", "apis": ["chardet.detect", "json.loads", "requests.get"], "libs": ["json", "chardet", "requests"], "doc": {"description": ["Fetches data from a specified REST API URL and processes it for JSON parsing. The process involves decoding", "and re-encoding the data, handling different encoding scenarios.", "Notes:", "- The function sets a timeout of 5 seconds for the API request.", "- It handles cases with empty content and undetectable encoding by returning an empty dictionary or raising an exception, respectively.", "- The decoding and re-encoding steps ensure compatibility with various data formats and the final JSON parsing."], "note": ["The function initiates an HTTP GET request to the specified URL with a 5-second timeout. It retrieves the response", "content in raw bytes."], "params": ["url (str): The URL of the REST API. Default is 'http://api.example.com/data'.", "from_encoding (str, optional): The original encoding of the data. If None, encoding is auto-detected. Default is None.", "to_encoding (str): The target encoding format for the data. Default is 'utf8'."], "returns": ["dict: The JSON-parsed data after re-encoding. 
Returns an empty dictionary if the content is empty."], "reqs": ["json", "requests", "chardet"], "raises": ["ValueError: \"Unable to detect encoding for non-empty content\", if it fails to detect the encoding for non-empty response content."], "example": [">>> data = f_863('http://api.example.com/data')", ">>> print(data)", "{'key': 'value'} # Example of expected output"]}} {"task_id": "f_558", "prompt": "import numpy as np\nfrom scipy import stats\n\ndef f_558(df):\n \"\"\"\n Given a Pandas DataFrame with random numeric values test if the data in each column is normally distributed using the Shapiro-Wilk test.\n\n Parameters:\n - df (DataFrame): A Pandas DataFrame with random numeric values.\n \n Returns:\n - dict: A dictionary with p-values from the Shapiro-Wilk test for each column.\n\n Requirements:\n - numpy\n - scipy\n\n Example:\n >>> np.random.seed(42)\n >>> df = pd.DataFrame(np.random.normal(size=(100, 5)))\n >>> p_values = f_558(df)\n >>> print(p_values)\n {0: 0.3595593273639679, 1: 0.23594242334365845, 2: 0.7625704407691956, 3: 0.481273353099823, 4: 0.13771861791610718}\n \"\"\"", "canonical_solution": "\n p_values = {}\n\n for col in df.columns:\n column_data = np.array(df[col])\n \n test_stat, p_value = stats.shapiro(column_data)\n \n p_values[col] = p_value\n\n return p_values", "test": "import unittest\nimport pandas as pd\nclass TestCases(unittest.TestCase):\n def setUp(self):\n np.random.seed(42)\n \n def test_case_1(self):\n df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})\n p_values = f_558(df)\n self.assertEqual(len(p_values), 2)\n self.assertTrue('a' in p_values)\n self.assertTrue('b' in p_values)\n self.assertTrue(p_values['a'] > 0.05)\n self.assertTrue(p_values['b'] > 0.05)\n def test_case_2(self):\n df = pd.DataFrame({'a': [-1, 0, 1], 'b': [4, 5, 6]})\n p_values = f_558(df)\n self.assertEqual(len(p_values), 2)\n self.assertTrue('a' in p_values)\n self.assertTrue('b' in p_values)\n self.assertTrue(p_values['a'] > 0.05)\n 
self.assertTrue(p_values['b'] > 0.05)\n def test_case_3(self):\n df = pd.DataFrame(np.random.normal(size=(100, 5)))\n p_values = f_558(df)\n self.assertEqual(len(p_values), 5)\n for col in df.columns:\n self.assertTrue(col in p_values)\n self.assertTrue(p_values[col] > 0.05)\n def test_case_4(self):\n df = pd.DataFrame(np.random.normal(size=(100, 5)))\n df['a'] = np.random.uniform(size=100)\n p_values = f_558(df)\n self.assertEqual(len(p_values), 6)\n for col in df.columns:\n self.assertTrue(col in p_values)\n if col == 'a':\n self.assertTrue(p_values[col] < 0.05)\n else:\n self.assertTrue(p_values[col] > 0.05)\n def test_case_5(self):\n df = pd.DataFrame(np.random.normal(size=(100, 5)))\n df['a'] = np.random.uniform(size=100)\n df['b'] = np.random.uniform(size=100)\n p_values = f_558(df)\n self.assertEqual(len(p_values), 7)\n for col in df.columns:\n self.assertTrue(col in p_values)\n if col in ['a', 'b']:\n self.assertTrue(p_values[col] < 0.05)\n else:\n self.assertTrue(p_values[col] > 0.05)", "apis": ["scipy.stats.shapiro", "numpy.array"], "libs": ["numpy", "scipy"], "doc": {"description": ["Given a Pandas DataFrame with random numeric values test if the data in each column is normally distributed using the Shapiro-Wilk test."], "note": [], "params": ["df (DataFrame): A Pandas DataFrame with random numeric values."], "returns": ["dict: A dictionary with p-values from the Shapiro-Wilk test for each column."], "reqs": ["numpy", "scipy"], "raises": [], "example": [">>> np.random.seed(42)", ">>> df = pd.DataFrame(np.random.normal(size=(100, 5)))", ">>> p_values = f_558(df)", ">>> print(p_values)", "{0: 0.3595593273639679, 1: 0.23594242334365845, 2: 0.7625704407691956, 3: 0.481273353099823, 4: 0.13771861791610718}"]}} -{"task_id": "f_901", "prompt": "import pandas as pd\nimport itertools\nimport numpy as np\n\n\ndef f_901(animals=None, foods=None):\n \"\"\"\n Create a DataFrame with combinations of animals and foods in a 'animal:food' format.\n\n Parameters:\n - 
animals (list of str, optional): A list of animal names. If not provided, \n defaults to a predefined list of common animals including 'Dog', 'Cat', 'Elephant', 'Tiger', 'Lion', 'Zebra', 'Giraffe', 'Bear', 'Monkey', 'Kangaroo'.\n - foods (list of str, optional): A list of food names. If not provided, \n defaults to a predefined list of common foods including 'Meat', 'Fish', 'Grass', 'Fruits', 'Insects', 'Seeds', 'Leaves'.\n\n Returns:\n - df (pandas.DataFrame): A DataFrame where each row represents a unique animal from the 'animals' \n list and each column represents a food item from the 'foods' list. Each cell contains a string in the format 'animal:food'.\n\n Handling of Special Cases:\n - If both 'animals' and 'foods' lists are empty or not provided, the function returns an empty DataFrame.\n - If either 'animals' or 'foods' list is empty or not provided, the function uses its predefined list for the missing parameter.\n\n Requirements:\n - pandas\n - numpy\n - itertools\n\n Example:\n >>> animal_food_pairs = f_901(['Dog', 'Cat'], ['Meat', 'Fish'])\n >>> print(animal_food_pairs)\n Meat Fish\n 0 Dog:Meat Dog:Fish\n 1 Cat:Meat Cat:Fish\n\n Note:\n - The function generates all possible combinations of the provided 'animals' and 'foods' using itertools.product.\n - The resulting pairs are shuffled randomly to ensure variety in the DataFrame layout.\n \"\"\"", "canonical_solution": "\n # Default lists if not provided\n if animals is None:\n animals = [\n \"Dog\",\n \"Cat\",\n \"Elephant\",\n \"Tiger\",\n \"Lion\",\n \"Zebra\",\n \"Giraffe\",\n \"Bear\",\n \"Monkey\",\n \"Kangaroo\",\n ]\n if foods is None:\n foods = [\"Meat\", \"Fish\", \"Grass\", \"Fruits\", \"Insects\", \"Seeds\", \"Leaves\"]\n\n # Handling edge case of empty lists\n if not animals or not foods:\n return pd.DataFrame()\n\n pairs = [f\"{a}:{f}\" for a, f in itertools.product(animals, foods)]\n\n # Reshape the data and create a DataFrame\n data = np.array(pairs).reshape(-1, len(foods))\n df = 
pd.DataFrame(data, columns=foods)\n\n return df", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_901.\"\"\"\n def test_default_input(self):\n \"\"\"Test with default inputs for animals and foods.\"\"\"\n random.seed(0)\n # Scenario: Testing with default inputs for animals and foods\n result = f_901()\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (10, 7),\n \"The shape of the DataFrame with default inputs is not as expected.\",\n )\n def test_custom_input(self):\n \"\"\"Test with custom inputs for animals and foods.\"\"\"\n random.seed(1)\n # Scenario: Testing with custom lists of animals and foods\n animals = [\"Dog\", \"Cat\", \"Elephant\"]\n foods = [\"Meat\", \"Fish\", \"Grass\", \"Fruits\"]\n result = f_901(animals, foods)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (3, 4),\n \"The shape of the DataFrame with custom inputs is not as expected.\",\n )\n def test_empty_input(self):\n \"\"\"Test with empty lists for animals and foods.\"\"\"\n random.seed(2)\n # Scenario: Testing with empty lists for animals and foods\n animals = []\n foods = []\n result = f_901(animals, foods)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (0, 0),\n \"The shape of the DataFrame with empty inputs is not as expected.\",\n )\n def test_single_input(self):\n \"\"\"Test with a single animal and a single food.\"\"\"\n random.seed(3)\n # Scenario: Testing with a single animal and a single food\n animals = [\"Dog\"]\n foods = [\"Meat\"]\n result = f_901(animals, foods)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (1, 1),\n \"The shape of the DataFrame with a single input is not as expected.\",\n )\n # Check if the pairs are correct\n self.assertIn(\n \"Dog:Meat\",\n result.values,\n \"The expected pair 'Dog:Meat' was not found in the resulting DataFrame.\",\n )\n def 
test_partial_default(self):\n \"\"\"Test with a custom list of animals and default list of foods.\"\"\"\n random.seed(4)\n # Scenario: Testing with a custom list of animals and default list of foods\n animals = [\"Dog\", \"Cat\", \"Elephant\"]\n result = f_901(animals)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (3, 7),\n \"The shape of the DataFrame with partial default inputs is not as expected.\",\n )", "apis": ["numpy.array", "pandas.DataFrame", "itertools.product"], "libs": ["numpy", "pandas", "itertools"], "doc": {"description": ["Create a DataFrame with combinations of animals and foods in a 'animal:food' format.", "Handling of Special Cases:", "- If both 'animals' and 'foods' lists are empty or not provided, the function returns an empty DataFrame.", "- If either 'animals' or 'foods' list is empty or not provided, the function uses its predefined list for the missing parameter."], "note": ["The function generates all possible combinations of the provided 'animals' and 'foods' using itertools.product.", "The resulting pairs are shuffled randomly to ensure variety in the DataFrame layout."], "params": ["animals (list of str, optional): A list of animal names. If not provided,", "defaults to a predefined list of common animals including 'Dog', 'Cat', 'Elephant', 'Tiger', 'Lion', 'Zebra', 'Giraffe', 'Bear', 'Monkey', 'Kangaroo'.", "foods (list of str, optional): A list of food names. If not provided,", "defaults to a predefined list of common foods including 'Meat', 'Fish', 'Grass', 'Fruits', 'Insects', 'Seeds', 'Leaves'."], "returns": ["df (pandas.DataFrame): A DataFrame where each row represents a unique animal from the 'animals'", "list and each column represents a food item from the 'foods' list. 
Each cell contains a string in the format 'animal:food'."], "reqs": ["pandas", "numpy", "itertools"], "raises": [], "example": [">>> animal_food_pairs = f_901(['Dog', 'Cat'], ['Meat', 'Fish'])", ">>> print(animal_food_pairs)", "Meat Fish", "0 Dog:Meat Dog:Fish", "1 Cat:Meat Cat:Fish"]}} -{"task_id": "f_881", "prompt": "import pandas as pd\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_881(s1, s2, n_clusters=3):\n \"\"\"\n Perform K-Means clustering on data points from two pandas Series and visualize the clusters.\n\n Parameters:\n - s1 (pandas.Series): The first series of data. Each value in the series represents a data point's coordinate along one dimension.\n - s2 (pandas.Series): The second series of data. Each value corresponds to a data point's coordinate along another dimension. The length of s2 must match that of s1.\n - n_clusters (int, optional): The number of clusters to form as well as the number of centroids to generate. Defaults to 3.\n\n Returns:\n - tuple: A tuple containing the following elements:\n - ndarray: An array of cluster labels indicating the cluster each data point belongs to.\n - matplotlib.axes.Axes: The Axes object of the plot, which shows the data points colored according to their cluster labels.\n\n Raises:\n - ValueError: If either s1 or s2 is not a pandas Series, raise \"s1 and s2 must be pandas Series\"\n - ValueError: If s1 and s2 have different lengths, raise \"s1 and s2 must have the same length\"\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Notes:\n - The function needs to ensure that s1 and s2 are pandas Series of equal length. \n - It then performs K-Means clustering on the combined data points from s1 and s2. \n - After clustering, it creates a scatter plot where each cluster is visualized with a different color. \n - The plot title is set to \"K-Means Clustering\" to describe the visualization technique. 
\n - A legend is added, which uses elements from the scatter plot to describe each cluster.\n \n Example:\n >>> s1 = pd.Series(np.random.rand(100), name='feature1')\n >>> s2 = pd.Series(np.random.rand(100), name='feature2')\n >>> labels, ax = f_881(s1, s2, n_clusters=4)\n >>> print(ax.get_title())\n K-Means Clustering\n\n \n \"\"\"", "canonical_solution": " if not isinstance(s1, pd.Series) or not isinstance(s2, pd.Series):\n raise ValueError(\"s1 and s2 must be pandas Series\")\n\n if len(s1) != len(s2):\n raise ValueError(\"s1 and s2 must have the same length\")\n\n # Create a DataFrame from the series\n df = pd.concat([s1, s2], axis=1)\n\n # Perform K-Means clustering\n kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)\n labels = kmeans.fit_predict(df)\n\n # Visualize the clusters\n _, ax = plt.subplots()\n scatter = ax.scatter(df[s1.name], df[s2.name], c=labels)\n ax.set_xlabel(s1.name)\n ax.set_ylabel(s2.name)\n ax.set_title(\"K-Means Clustering\")\n plt.legend(*scatter.legend_elements(), title=\"Clusters\")\n\n return labels, ax", "test": "import pandas as pd\nimport numpy as np\nimport unittest\nimport os\nfrom sklearn.datasets import make_blobs\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_881.\"\"\"\n def setUp(self) -> None:\n os.environ[\"LOKY_MAX_CPU_COUNT\"] = \"2\"\n def test_random_data_size_100(self):\n \"\"\"Test with random data of size 100 and default number of clusters\"\"\"\n np.random.seed(42)\n s1 = pd.Series(np.random.rand(100), name=\"feature1\")\n np.random.seed(0)\n s2 = pd.Series(np.random.rand(100), name=\"feature2\")\n labels, ax = f_881(s1, s2)\n # Check if labels are ndarray\n self.assertIsInstance(labels, np.ndarray)\n # Check the plot's title\n self.assertEqual(ax.get_title(), \"K-Means Clustering\")\n def test_random_data_custom_clusters(self):\n \"\"\"Test with random data of size 100 and custom number of clusters\"\"\"\n np.random.seed(42)\n s1 = pd.Series(np.random.rand(100), name=\"feature1\")\n 
np.random.seed(0)\n s2 = pd.Series(np.random.rand(100), name=\"feature2\")\n labels, ax = f_881(s1, s2, n_clusters=5)\n # Check if labels are ndarray\n self.assertIsInstance(labels, np.ndarray)\n self.assertEqual(len(set(labels)), 5)\n # Check the plot's title\n self.assertEqual(ax.get_title(), \"K-Means Clustering\")\n def test_invalid_input_non_series(self):\n \"\"\"Test with invalid input types (non-Series)\"\"\"\n with self.assertRaises(ValueError):\n f_881([1, 2, 3], pd.Series([4, 5, 6]))\n def test_invalid_input_mismatched_length(self):\n \"\"\"Test with mismatched length of Series\"\"\"\n s1 = pd.Series([1, 2, 3], name=\"feature1\")\n s2 = pd.Series([4, 5], name=\"feature2\")\n with self.assertRaises(ValueError):\n f_881(s1, s2)\n def test_custom_clusters_with_synthetic_data(self):\n \"\"\"Test with synthetic data and custom number of clusters using make_blobs\"\"\"\n # Generate synthetic data with 2 distinct clusters\n X, _ = make_blobs(n_samples=100, centers=2, random_state=42)\n # Convert to pandas Series\n s1 = pd.Series(X[:, 0], name=\"feature1\")\n s2 = pd.Series(X[:, 1], name=\"feature2\")\n # Run the clustering function\n labels, ax = f_881(s1, s2, n_clusters=2)\n # Check if labels are ndarray\n self.assertIsInstance(labels, np.ndarray)\n # Check the number of unique labels (should be 2 for 2 clusters)\n self.assertEqual(len(set(labels)), 2)\n # Check the plot's title\n self.assertEqual(ax.get_title(), \"K-Means Clustering\")\n def tearDown(self):\n plt.clf()", "apis": ["pandas.concat", "sklearn.cluster.KMeans", "pandas.Series", "matplotlib.pyplot.legend", "matplotlib.pyplot.subplots"], "libs": ["matplotlib", "pandas", "sklearn"], "doc": {"description": ["Perform K-Means clustering on data points from two pandas Series and visualize the clusters.", "Notes:", "- The function needs to ensure that s1 and s2 are pandas Series of equal length.", "- It then performs K-Means clustering on the combined data points from s1 and s2.", "- After clustering, it 
creates a scatter plot where each cluster is visualized with a different color.", "- The plot title is set to \"K-Means Clustering\" to describe the visualization technique.", "- A legend is added, which uses elements from the scatter plot to describe each cluster."], "note": [], "params": ["s1 (pandas.Series): The first series of data. Each value in the series represents a data point's coordinate along one dimension.", "s2 (pandas.Series): The second series of data. Each value corresponds to a data point's coordinate along another dimension. The length of s2 must match that of s1.", "n_clusters (int, optional): The number of clusters to form as well as the number of centroids to generate. Defaults to 3."], "returns": ["tuple: A tuple containing the following elements:", "ndarray: An array of cluster labels indicating the cluster each data point belongs to.", "matplotlib.axes.Axes: The Axes object of the plot, which shows the data points colored according to their cluster labels."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": ["ValueError: If either s1 or s2 is not a pandas Series, raise \"s1 and s2 must be pandas Series\"", "ValueError: If s1 and s2 have different lengths, raise \"s1 and s2 must have the same length\""], "example": [">>> s1 = pd.Series(np.random.rand(100), name='feature1')", ">>> s2 = pd.Series(np.random.rand(100), name='feature2')", ">>> labels, ax = f_881(s1, s2, n_clusters=4)", ">>> print(ax.get_title())", "K-Means Clustering"]}} -{"task_id": "f_828", "prompt": "import pandas as pd\nimport json\nimport os\nimport math\n\n\ndef f_828(json_data, output_dir=\".\", file_name=\"country_population_report.csv\"):\n \"\"\"\n Generates a population report DataFrame and CSV file based on provided JSON data.\n\n Parameters:\n - json_data (str): Nested JSON string containing country names (str) as keys and\n populations (int) as values. 
The parent key is expected to be \"Countries\".\n Example format:\n '{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'.\n - output_dir (str): Directory path where the CSV report will be saved.\n Defaults to the current directory.\n The function will create it if it does not exist.\n - file_name (str): Name of the CSV report. Defaults to \"country_population_report.csv\".\n\n Returns:\n - str: The file path of the generated CSV report.\n - pd.DataFrame: The country-population data loaded from the input JSON, with columns:\n \"Country\", \"Population\".\n\n Raises:\n - ValueError: If the JSON data is malformed, empty, contains non-string country names,\n non-numeric or negative populations.\n - IOError: If the file cannot be written to the specified directory.\n\n Requirements:\n - json\n - os\n - pandas\n - math\n\n Notes:\n - Output DataFrame has no extra index column.\n - If this function encounters a float population that is otherwise valid, it will round it\n down to the nearest integer.\n\n Example:\n >>> json_str = '{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'\n >>> csv_file_path, df = f_828(json_str)\n >>> print(csv_file_path)\n ./country_population_report.csv\n >>> df\n Country Population\n 0 Country A 331002651\n 1 Country B 67886011\n \"\"\"", "canonical_solution": " os.makedirs(output_dir, exist_ok=True)\n file_path = os.path.join(output_dir, file_name)\n\n try:\n data = json.loads(json_data)\n except json.JSONDecodeError:\n raise ValueError(\"Invalid JSON data provided.\")\n\n country_data_dict = data.get(\"Countries\")\n\n if country_data_dict is None:\n raise ValueError(\"No valid country population data found in JSON.\")\n\n for country, population in country_data_dict.items():\n if not isinstance(country, str):\n raise ValueError(f\"Country name must be a string. 
Invalid entry: {country}\")\n if not isinstance(population, int):\n if isinstance(population, float):\n country_data_dict[country] = math.floor(population)\n else:\n raise ValueError(\n f\"Population must be an integer. Invalid entry for {country}: {population}\"\n )\n if population < 0:\n raise ValueError(\"Population cannot be negative.\")\n\n country_data = [\n [country, population] for country, population in country_data_dict.items()\n ]\n df = pd.DataFrame(country_data, columns=[\"Country\", \"Population\"])\n\n try:\n df.to_csv(file_path, index=False)\n except IOError as e:\n raise IOError(f\"Failed to write the CSV file to {output_dir}: {e}\")\n\n return file_path, df", "test": "import unittest\nimport os\nimport json\nimport pandas as pd\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.output_dir = self.temp_dir.name\n def tearDown(self):\n self.temp_dir.cleanup()\n def check_df_format(self, df):\n self.assertIsInstance(df, pd.DataFrame)\n self.assertTrue(\"Country\" in df.columns)\n self.assertTrue(\"Population\" in df.columns)\n def test_case_1(self):\n # Test basic case\n json_data = '{\"Countries\": {\"USA\": 331002651, \"UK\": 67886011}}'\n csv_file, df1 = f_828(json_data, self.output_dir)\n self.check_df_format(df1)\n self.assertTrue(os.path.exists(csv_file))\n df2 = pd.read_csv(csv_file)\n self.check_df_format(df2)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertTrue(df1.shape[0] == 2)\n self.assertEqual(df1.loc[df1.Country == \"USA\", \"Population\"].item(), 331002651)\n self.assertEqual(df1.loc[df1.Country == \"UK\", \"Population\"].item(), 67886011)\n def test_case_2(self):\n # Test with empty json\n json_data = \"{}\"\n with self.assertRaises(ValueError):\n f_828(json_data, self.output_dir)\n def test_case_3(self):\n # Test incorrect JSON format\n with self.assertRaises(ValueError):\n f_828('{\"WRONG\": {\"USA\": 331002651, \"UK\": 67886011}}', 
self.output_dir)\n with self.assertRaises(ValueError):\n f_828('{\"USA\": 331002651, \"UK\": 67886011}', self.output_dir)\n with self.assertRaises(ValueError):\n f_828('{\"Countries\": {\"USA\": 331002651, \"UK\"', self.output_dir)\n def test_case_4(self):\n # Test that output directory is created if it does not exist\n non_existing_dir = os.path.join(self.output_dir, \"new_directory\")\n self.assertFalse(\n os.path.exists(non_existing_dir), \"Directory already exists before test.\"\n )\n json_data = '{\"Countries\": {\"Country A\": 1000}}'\n _, _ = f_828(json_data, non_existing_dir)\n self.assertTrue(\n os.path.exists(non_existing_dir),\n \"Directory was not created by the function.\",\n )\n def test_case_5(self):\n # Test with country names that include special characters\n json_data = '{\"Countries\": {\"C\u00f4te d\\'Ivoire\": 26378274, \"S\u00e3o Tom\u00e9 and Pr\u00edncipe\": 219159}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertTrue(\"C\u00f4te d'Ivoire\" in df.Country.values)\n self.assertTrue(\"S\u00e3o Tom\u00e9 and Pr\u00edncipe\" in df.Country.values)\n def test_case_6(self):\n # Test with empty \"Countries\" object\n json_data = '{\"Countries\": {}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertTrue(df.empty)\n def test_case_7(self):\n # Test with non-numeric/negative population values\n with self.assertRaises(ValueError):\n f_828(\n '{\"Countries\": {\"Country X\": \"1000000\", \"Country Y\": null}}',\n self.output_dir,\n )\n with self.assertRaises(ValueError):\n f_828(\n '{\"Countries\": {\"Country X\": \"1000000\", \"Country Y\": \"ABC\"}}',\n self.output_dir,\n )\n with self.assertRaises(ValueError):\n f_828(\n '{\"Countries\": {\"Country X\": \"1000000\", \"Country Y\": -1}}',\n self.output_dir,\n )\n def test_case_8(self):\n # Test handling zero population\n json_data 
= '{\"Countries\": {\"Uninhabited Island\": 0}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertTrue(\"Uninhabited Island\" in df.Country.values)\n self.assertEqual(\n df.loc[df.Country == \"Uninhabited Island\", \"Population\"].item(), 0\n )\n def test_case_9(self):\n # Test handling valid floats - should be correctly rounded\n json_data = '{\"Countries\": {\"Country Float Pop\": 1234567.89, \"Another Country\": 98765.432}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertEqual(\n df.loc[df.Country == \"Country Float Pop\", \"Population\"].item(), 1234567\n )\n self.assertEqual(\n df.loc[df.Country == \"Another Country\", \"Population\"].item(), 98765\n )", "apis": ["math.floor", "pandas.DataFrame", "os.path", "json.JSONDecodeError", "json.loads", "os.makedirs", "os.path.join"], "libs": ["pandas", "json", "os", "math"], "doc": {"description": ["Generates a population report DataFrame and CSV file based on provided JSON data.", "Notes:", "- Output DataFrame has no extra index column.", "- If this function encounters a float population that is otherwise valid, it will round it", "down to the nearest integer."], "note": [], "params": ["json_data (str): Nested JSON string containing country names (str) as keys and", "populations (int) as values. The parent key is expected to be \"Countries\".", "Example format:", "'{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'.", "output_dir (str): Directory path where the CSV report will be saved.", "Defaults to the current directory.", "The function will create it if it does not exist.", "file_name (str): Name of the CSV report. 
Defaults to \"country_population_report.csv\"."], "returns": ["str: The file path of the generated CSV report.", "pd.DataFrame: The country-population data loaded from the input JSON, with columns:", "\"Country\", \"Population\"."], "reqs": ["json", "os", "pandas", "math"], "raises": ["ValueError: If the JSON data is malformed, empty, contains non-string country names,", "non-numeric or negative populations.", "IOError: If the file cannot be written to the specified directory."], "example": [">>> json_str = '{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'", ">>> csv_file_path, df = f_828(json_str)", ">>> print(csv_file_path)", "./country_population_report.csv", ">>> df", "Country Population", "0 Country A 331002651", "1 Country B 67886011"]}} -{"task_id": "f_902", "prompt": "import itertools\nimport random\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Constants\nSHAPES = [\n \"Circle\",\n \"Square\",\n \"Triangle\",\n \"Rectangle\",\n \"Pentagon\",\n \"Hexagon\",\n \"Heptagon\",\n \"Octagon\",\n \"Nonagon\",\n \"Decagon\",\n]\nCOLORS = [\n \"Red\",\n \"Blue\",\n \"Green\",\n \"Yellow\",\n \"Black\",\n \"White\",\n \"Purple\",\n \"Orange\",\n \"Pink\",\n \"Brown\",\n]\n\n\ndef f_902(num_pairs=10):\n \"\"\"\n Generate and display a countplot of predefined shape-color pairs.\n\n This function creates a visual representation of a specified number of unique shape-color combinations,\n each displayed as a bar in the countplot. The shape-color pairs are selected from a predefined list.\n\n Parameters:\n - num_pairs (int): The number of unique shape-color pairs to be displayed in the countplot.\n Default is 10. 
If the requested number is less than 1 or greater than the total\n possible unique combinations (100), it is adjusted to the valid range (1 to 100).\n\n Returns:\n - ax (matplotlib.axes._subplots.AxesSubplot): The Axes object of the countplot, which can be used for\n further customizations or to retrieve information about the plot.\n\n Requirements:\n - itertools\n - seaborn\n - matplotlib\n\n Example:\n >>> ax = f_902(10)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(9)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(8)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(7)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(6)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n \"\"\"", "canonical_solution": " max_pairs = len(SHAPES) * len(COLORS)\n num_pairs = min(num_pairs, max_pairs)\n \n pairs = [f\"{s}:{c}\" for s, c in itertools.product(SHAPES, COLORS)][:num_pairs]\n \n # Drawing the countplot\n ax = sns.countplot(x=pairs, hue=pairs, palette=\"Set3\", legend=False)\n plt.xticks(rotation=90)\n \n return ax", "test": "import unittest\nimport matplotlib.pyplot as 
plt\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_902.\"\"\"\n def tearDown(self):\n plt.clf()\n def test_basic_functionality(self):\n \"\"\"Test basic functionality with default parameters.\"\"\"\n random.seed(0)\n ax = f_902()\n self.assertIsInstance(ax, plt.Axes)\n def test_pair_count(self):\n \"\"\"Test if the number of displayed shape-color pairs matches the input.\"\"\"\n random.seed(1)\n num_pairs = 7\n ax = f_902(num_pairs)\n displayed_pairs = len(set(tick.get_text() for tick in ax.get_xticklabels()))\n self.assertEqual(displayed_pairs, num_pairs)\n def test_valid_pairs(self):\n \"\"\"Ensure displayed shape-color pairs are valid combinations.\"\"\"\n random.seed(2)\n ax = f_902(10)\n displayed_pairs = [tick.get_text() for tick in ax.get_xticklabels()]\n for pair in displayed_pairs:\n shape, color = pair.split(\":\")\n self.assertIn(shape, SHAPES)\n self.assertIn(color, COLORS)\n def test_max_pairs(self):\n \"\"\"Test with the maximum number of pairs possible.\"\"\"\n random.seed(3)\n max_pairs = len(SHAPES) * len(COLORS)\n ax = f_902(max_pairs)\n displayed_pairs = len(set(tick.get_text() for tick in ax.get_xticklabels()))\n self.assertEqual(displayed_pairs, max_pairs)\n def test_min_pairs(self):\n \"\"\"Test with the minimum number of pairs, which is 1.\"\"\"\n random.seed(4)\n ax = f_902(1)\n displayed_pairs = len(set(tick.get_text() for tick in ax.get_xticklabels()))\n self.assertEqual(displayed_pairs, 1)", "apis": ["matplotlib.pyplot.xticks", "seaborn.countplot", "itertools.product"], "libs": ["matplotlib", "itertools", "seaborn"], "doc": {"description": ["Generate and display a countplot of predefined shape-color pairs.", "This function creates a visual representation of a specified number of unique shape-color combinations,", "each displayed as a bar in the countplot. 
The shape-color pairs are selected from a predefined list."], "note": [], "params": ["num_pairs (int): The number of unique shape-color pairs to be displayed in the countplot.", "Default is 10. If the requested number is less than 1 or greater than the total", "possible unique combinations (100), it is adjusted to the valid range (1 to 100)."], "returns": ["ax (matplotlib.axes._subplots.AxesSubplot): The Axes object of the countplot, which can be used for", "further customizations or to retrieve information about the plot."], "reqs": ["itertools", "seaborn", "matplotlib"], "raises": [], "example": [">>> ax = f_902(10)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(9)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(8)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(7)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(6)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']"]}} -{"task_id": "f_846", "prompt": "import urllib.request\nfrom lxml import etree\nimport pandas as pd\n\n\ndef f_846(url):\n \"\"\"\n Fetches and parses an XML file from a specified 
URL, then converts it into a Pandas DataFrame.\n\n Parameters:\n url (str): The URL of the CSV file to be downloaded. Must be a valid and accessible URL.\n \n Returns:\n pandas.DataFrame\n A DataFrame constructed from the parsed XML data. Each row of the DataFrame corresponds to an 'item' element\n in the XML file, with child elements of 'item' becoming columns in the DataFrame.\n\n Raises:\n ValueError\n This error is raised in several scenarios:\n 1. If the URL is invalid or the XML file cannot be fetched from the URL.\n 2. If the XML file has invalid syntax.\n 3. If the XML structure does not conform to the expected format.\n\n Requirements:\n - urllib\n - lxml\n - pandas\n\n Examples:\n # Example with a valid XML structure\n >>> df = f_846('http://example.com/sample_data.xml')\n >>> print(df)\n name age\n 0 John 25\n 1 Jane 30\n\n # Example with an invalid XML structure\n >>> df = f_846('http://example.com/invalid_structure.xml')\n ValueError: XML structure does not match expected format.\n \"\"\"", "canonical_solution": " try:\n with urllib.request.urlopen(url) as response:\n xml_data = response.read()\n except Exception as e:\n raise ValueError(f\"Error fetching the XML file: {e}\")\n\n try:\n xml_tree = etree.XML(xml_data)\n except etree.XMLSyntaxError:\n raise ValueError(\"Invalid XML syntax\")\n\n data = []\n for item in xml_tree.findall(\".//item\"):\n data_item = {child.tag: child.text for child in item}\n data.append(data_item)\n\n if not data:\n raise ValueError(\"XML structure does not match expected format.\")\n\n return pd.DataFrame(data)", "test": "import unittest\nimport pandas as pd\nfrom unittest.mock import patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_846 function.\"\"\"\n @patch(\"urllib.request.urlopen\")\n def test_valid_xml(self, mock_urlopen):\n \"\"\"Test that the function returns the correct DataFrame for a given XML file.\"\"\"\n # Mocking the XML data\n valid_xml_data = b\"John25Jane30\"\n 
mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n valid_xml_data\n )\n url = \"http://example.com/sample_data.xml\"\n expected_df = pd.DataFrame({\"name\": [\"John\", \"Jane\"], \"age\": [\"25\", \"30\"]})\n result_df = f_846(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n @patch(\"urllib.request.urlopen\")\n def test_empty_xml(self, mock_urlopen):\n \"\"\"Test that the function raises an error for an empty XML file.\"\"\"\n # Mocking empty XML data\n empty_xml_data = b\"\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n empty_xml_data\n )\n url = \"http://example.com/empty_data.xml\"\n with self.assertRaises(ValueError):\n f_846(url)\n @patch(\"urllib.request.urlopen\")\n def test_different_structure_xml(self, mock_urlopen):\n \"\"\"Test that the function raises an error for an XML file with a different structure.\"\"\"\n # Mocking XML with different structure\n different_structure_xml = (\n b\"John\"\n )\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n different_structure_xml\n )\n url = \"http://example.com/different_structure_data.xml\"\n with self.assertRaises(ValueError):\n f_846(url)\n @patch(\"urllib.request.urlopen\")\n def test_invalid_url(self, mock_urlopen):\n \"\"\"Test that the function raises an error for an invalid URL.\"\"\"\n # Simulate an error in URL fetching\n mock_urlopen.side_effect = Exception(\"URL fetch error\")\n url = \"http://example.com/nonexistent/file.xml\"\n with self.assertRaises(ValueError):\n f_846(url)\n @patch(\"urllib.request.urlopen\")\n def test_non_xml_data(self, mock_urlopen):\n \"\"\"Test that the function raises an error for non-XML data.\"\"\"\n # Mocking non-XML data\n non_xml_data = b\"Not an XML content\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n non_xml_data\n )\n url = \"http://example.com/non_xml_data.txt\"\n with self.assertRaises(ValueError):\n f_846(url)", "apis": 
["lxml.etree.XMLSyntaxError", "urllib.request.urlopen", "lxml.etree.XML", "pandas.DataFrame", "urllib.request"], "libs": ["urllib", "pandas", "lxml"], "doc": {"description": ["Fetches and parses an XML file from a specified URL, then converts it into a Pandas DataFrame.", "# Example with an invalid XML structure", ">>> df = f_846('http://example.com/invalid_structure.xml')", "ValueError: XML structure does not match expected format."], "note": [], "params": ["url (str): The URL of the CSV file to be downloaded. Must be a valid and accessible URL."], "returns": ["pandas.DataFrame", "A DataFrame constructed from the parsed XML data. Each row of the DataFrame corresponds to an 'item' element", "in the XML file, with child elements of 'item' becoming columns in the DataFrame."], "reqs": ["urllib", "lxml", "pandas"], "raises": ["ValueError", "This error is raised in several scenarios:", "1. If the URL is invalid or the XML file cannot be fetched from the URL.", "2. If the XML file has invalid syntax.", "3. 
If the XML structure does not conform to the expected format."], "example": ["Examples:", "# Example with a valid XML structure", ">>> df = f_846('http://example.com/sample_data.xml')", ">>> print(df)", "name age", "0 John 25", "1 Jane 30"]}} -{"task_id": "f_581", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\n\n\ndef f_581(df):\n \"\"\"\n Divide the given DataFrame into a training set and a test set (70%: 30% split), separate the \"target\" column and return the four resulting DataFrames.\n\n Parameters:\n - df (pd.DataFrame): pandas DataFrame that contains a column named 'target'.\n\n Returns:\n - tuple: A tuple containing four DataFrames: X_train, X_test, y_train, y_test.\n\n Requirements:\n - pandas\n - sklearn\n \n Example:\n >>> np.random.seed(42) # Ensure reproducibility\n >>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 5)), columns=list('ABCDE')) # Explicitly using np and pd\n >>> df['target'] = np.random.randint(0, 2, size=100) # Adding 'target' column using np\n >>> X_train, X_test, y_train, y_test = f_581(df)\n >>> print(X_train.shape) # Expected shape of training data\n (70, 5)\n \"\"\"", "canonical_solution": " X = pd.DataFrame.drop(df, 'target', axis=1)\n y = pd.DataFrame(df['target'])\n\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n\n return X_train, X_test, y_train, y_test", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.randint(0, 100, size=(100, 5)), columns=list('ABCDE'))\n df['target'] = np.random.randint(0, 2, size=100)\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (70, 5))\n self.assertEqual(X_test.shape, (30, 5))\n self.assertEqual(y_train.shape, (70, 1))\n self.assertEqual(y_test.shape, (30, 1))\n def test_case_2(self):\n df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'target': [0, 1, 0]})\n X_train, X_test, y_train, 
y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))\n def test_case_3(self):\n df = pd.DataFrame({'A': [0, 0, 0], 'B': [0, 0, 0], 'target': [0, 0, 0]})\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))\n self.assertEqual(X_train.iloc[0, 0], 0)\n self.assertEqual(X_train.iloc[0, 1], 0)\n self.assertEqual(X_train.iloc[1, 0], 0)\n self.assertEqual(X_train.iloc[1, 1], 0)\n self.assertEqual(X_test.iloc[0, 0], 0)\n self.assertEqual(X_test.iloc[0, 1], 0)\n self.assertEqual(y_train.iloc[0].to_list(), [0])\n self.assertEqual(y_train.iloc[1].to_list(), [0])\n self.assertEqual(y_test.iloc[0].to_list(), [0])\n def test_case_4(self):\n df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'target': [1, 1, 1]})\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))\n \n def test_case_5(self):\n df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'target': [0, 0, 0]})\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))", "apis": ["sklearn.model_selection.train_test_split", "pandas.DataFrame.drop", "pandas.DataFrame"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Divide the given DataFrame into a training set and a test set (70%: 30% split), separate the \"target\" column and return the four resulting DataFrames."], "note": [], "params": ["df (pd.DataFrame): pandas DataFrame that contains a column named 'target'."], "returns": ["tuple: A tuple containing four DataFrames: 
X_train, X_test, y_train, y_test."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> np.random.seed(42) # Ensure reproducibility", ">>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 5)), columns=list('ABCDE')) # Explicitly using np and pd", ">>> df['target'] = np.random.randint(0, 2, size=100) # Adding 'target' column using np", ">>> X_train, X_test, y_train, y_test = f_581(df)", ">>> print(X_train.shape) # Expected shape of training data", "(70, 5)"]}} -{"task_id": "f_877", "prompt": "import pandas as pd\nimport numpy as np\n\n\nCATEGORIES = [\"Electronics\", \"Clothing\", \"Home Decor\", \"Automotive\", \"Books\"]\n\n\ndef f_877(s1, s2):\n \"\"\"\n Compares and visualizes the sales data of two stores for predefined categories.\n The function generates a bar plot for categories where both stores have sales exceeding a specified threshold.\n The Euclidean distance between the two series is also computed.\n \n Parameters:\n s1 (pd.Series): Sales data for store 1, indexed by categories.\n s2 (pd.Series): Sales data for store 2, indexed by categories.\n\n Returns:\n matplotlib.axes.Axes or None: A bar plot for categories where both stores' sales exceed the threshold of 200,\n or None if no such categories exist.\n float: The Euclidean distance between the two series or 0.0 if no categories meet the threshold.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> np.random.seed(seed=32)\n >>> s1 = pd.Series(np.random.randint(100, 500, size=5), index=CATEGORIES)\n >>> s2 = pd.Series(np.random.randint(150, 600, size=5), index=CATEGORIES)\n >>> ax, edit_distance = f_877(s1, s2)\n >>> ax.get_title()\n 'Sales Comparison Above Threshold in Categories'\n >>> edit_distance\n 387.5590277622236\n \"\"\"", "canonical_solution": "\n # Determine categories where both stores exceed the sales threshold\n high_sales_categories = s1.index[(s1 > 200) & (s2 > 200)]\n\n if high_sales_categories.empty:\n return None, 0.0\n\n # Prepare the data for plotting\n df = 
pd.DataFrame(\n {\"Store 1\": s1[high_sales_categories], \"Store 2\": s2[high_sales_categories]}\n )\n\n # compute the edit distance between the two series\n edit_distance = np.linalg.norm(df[\"Store 1\"] - df[\"Store 2\"])\n \n # Generate the bar plot\n ax = df.plot(kind=\"bar\", title=\"Sales Comparison Above Threshold in Categories\")\n return ax, edit_distance", "test": "import pandas as pd\nimport numpy as np\nimport unittest\nimport matplotlib.pyplot as plt\n# Constants (should be kept consistent with function.py)\nCATEGORIES = [\"Electronics\", \"Clothing\", \"Home Decor\", \"Automotive\", \"Books\"]\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for function f_877.\"\"\"\n def test_sales_above_threshold(self):\n \"\"\"Test that the function returns a plot when sales exceed the threshold\"\"\"\n np.random.seed(seed=32)\n s1 = pd.Series(np.random.randint(100, 500, size=5), index=CATEGORIES)\n np.random.seed(seed=32)\n s2 = pd.Series(np.random.randint(150, 600, size=5), index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check the correct categories are plotted\n categories_plotted = [label.get_text() for label in ax.get_xticklabels()]\n self.assertListEqual(\n categories_plotted, [\"Electronics\", \"Home Decor\", \"Automotive\", \"Books\"]\n )\n # Check the title of the plot\n self.assertEqual(\n ax.get_title(), \"Sales Comparison Above Threshold in Categories\"\n )\n self.assertAlmostEqual(edit_distance, 100.0)\n \n def test_no_sales_above_threshold(self):\n \"\"\"Test that no categories are plotted when no sales exceed the threshold\"\"\"\n np.random.seed(seed=32)\n s1 = pd.Series(np.random.randint(50, 150, size=5), index=CATEGORIES)\n np.random.seed(seed=32)\n s2 = pd.Series(np.random.randint(50, 150, size=5), index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that no categories are plotted\n self.assertIsNone(\n ax, \"Expected None as no categories should meet the threshold\"\n )\n self.assertAlmostEqual(edit_distance, 0.0)\n def 
test_all_sales_above_threshold(self):\n \"\"\"Test that all categories are plotted when all sales exceed the threshold\"\"\"\n np.random.seed(seed=123)\n s1 = pd.Series(np.random.randint(200, 500, size=5), index=CATEGORIES)\n np.random.seed(seed=123)\n s2 = pd.Series(np.random.randint(250, 600, size=5), index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that all categories are plotted\n categories_plotted = [label.get_text() for label in ax.get_xticklabels()]\n self.assertListEqual(categories_plotted, CATEGORIES)\n self.assertAlmostEqual(edit_distance, 389.8127755730948)\n \n def test_some_sales_above_threshold(self):\n \"\"\"Test that some categories are plotted when some sales exceed the threshold\"\"\"\n s1 = pd.Series([250, 180, 290, 200, 290], index=CATEGORIES)\n s2 = pd.Series([260, 290, 195, 299, 295], index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that only the correct categories are plotted\n categories_plotted = [label.get_text() for label in ax.get_xticklabels()]\n self.assertListEqual(categories_plotted, [\"Electronics\", \"Books\"])\n self.assertAlmostEqual(edit_distance, 11.180339887498949)\n \n def test_single_sales_above_threshold(self):\n \"\"\"Test that only a single category is plotted when only a single category has sales exceeding the threshold\"\"\"\n s1 = pd.Series([150, 180, 290, 200, 190], index=CATEGORIES)\n s2 = pd.Series([160, 190, 295, 199, 195], index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that only a single category is plotted\n categories_plotted = [label.get_text() for label in ax.get_xticklabels()]\n self.assertListEqual(categories_plotted, [\"Home Decor\"])\n self.assertAlmostEqual(edit_distance, 5.0)\n \n def tearDown(self):\n plt.close()", "apis": ["numpy.linalg.norm", "pandas.DataFrame", "numpy.linalg"], "libs": ["numpy", "pandas"], "doc": {"description": ["Compares and visualizes the sales data of two stores for predefined categories.", "The function generates a bar plot for 
categories where both stores have sales exceeding a specified threshold.", "The Euclidean distance between the two series is also computed."], "note": [], "params": ["s1 (pd.Series): Sales data for store 1, indexed by categories.", "s2 (pd.Series): Sales data for store 2, indexed by categories."], "returns": ["matplotlib.axes.Axes or None: A bar plot for categories where both stores' sales exceed the threshold of 200,", "or None if no such categories exist.", "float: The Euclidean distance between the two series or 0.0 if no categories meet the threshold."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> np.random.seed(seed=32)", ">>> s1 = pd.Series(np.random.randint(100, 500, size=5), index=CATEGORIES)", ">>> s2 = pd.Series(np.random.randint(150, 600, size=5), index=CATEGORIES)", ">>> ax, edit_distance = f_877(s1, s2)", ">>> ax.get_title()", "'Sales Comparison Above Threshold in Categories'", ">>> edit_distance", "387.5590277622236"]}} +{"task_id": "f_901", "prompt": "import pandas as pd\nimport itertools\nimport numpy as np\n\n\ndef f_901(animals=None, foods=None):\n \"\"\"\n Create a DataFrame with combinations of animals and foods in a 'animal:food' format.\n\n Parameters:\n - animals (list of str, optional): A list of animal names. If not provided, \n defaults to a predefined list of common animals including 'Dog', 'Cat', 'Elephant', 'Tiger', 'Lion', 'Zebra', 'Giraffe', 'Bear', 'Monkey', 'Kangaroo'.\n - foods (list of str, optional): A list of food names. If not provided, \n defaults to a predefined list of common foods including 'Meat', 'Fish', 'Grass', 'Fruits', 'Insects', 'Seeds', 'Leaves'.\n\n Returns:\n - df (pandas.DataFrame): A DataFrame where each row represents a unique animal from the 'animals' \n list and each column represents a food item from the 'foods' list. 
Each cell contains a string in the format 'animal:food'.\n\n Handling of Special Cases:\n - If both 'animals' and 'foods' lists are empty or not provided, the function returns an empty DataFrame.\n - If either 'animals' or 'foods' list is empty or not provided, the function uses its predefined list for the missing parameter.\n\n Requirements:\n - pandas\n - numpy\n - itertools\n\n Example:\n >>> animal_food_pairs = f_901(['Dog', 'Cat'], ['Meat', 'Fish'])\n >>> print(animal_food_pairs)\n Meat Fish\n 0 Dog:Meat Dog:Fish\n 1 Cat:Meat Cat:Fish\n\n Note:\n - The function generates all possible combinations of the provided 'animals' and 'foods' using itertools.product.\n - The resulting pairs are shuffled randomly to ensure variety in the DataFrame layout.\n \"\"\"", "canonical_solution": "\n # Default lists if not provided\n if animals is None:\n animals = [\n \"Dog\",\n \"Cat\",\n \"Elephant\",\n \"Tiger\",\n \"Lion\",\n \"Zebra\",\n \"Giraffe\",\n \"Bear\",\n \"Monkey\",\n \"Kangaroo\",\n ]\n if foods is None:\n foods = [\"Meat\", \"Fish\", \"Grass\", \"Fruits\", \"Insects\", \"Seeds\", \"Leaves\"]\n\n # Handling edge case of empty lists\n if not animals or not foods:\n return pd.DataFrame()\n\n pairs = [f\"{a}:{f}\" for a, f in itertools.product(animals, foods)]\n\n # Reshape the data and create a DataFrame\n data = np.array(pairs).reshape(-1, len(foods))\n df = pd.DataFrame(data, columns=foods)\n\n return df", "test": "import unittest\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for the function f_901.\"\"\"\n def test_default_input(self):\n \"\"\"Test with default inputs for animals and foods.\"\"\"\n random.seed(0)\n # Scenario: Testing with default inputs for animals and foods\n result = f_901()\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (10, 7),\n \"The shape of the DataFrame with default inputs is not as expected.\",\n )\n def test_custom_input(self):\n \"\"\"Test with custom inputs for animals and 
foods.\"\"\"\n random.seed(1)\n # Scenario: Testing with custom lists of animals and foods\n animals = [\"Dog\", \"Cat\", \"Elephant\"]\n foods = [\"Meat\", \"Fish\", \"Grass\", \"Fruits\"]\n result = f_901(animals, foods)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (3, 4),\n \"The shape of the DataFrame with custom inputs is not as expected.\",\n )\n def test_empty_input(self):\n \"\"\"Test with empty lists for animals and foods.\"\"\"\n random.seed(2)\n # Scenario: Testing with empty lists for animals and foods\n animals = []\n foods = []\n result = f_901(animals, foods)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (0, 0),\n \"The shape of the DataFrame with empty inputs is not as expected.\",\n )\n def test_single_input(self):\n \"\"\"Test with a single animal and a single food.\"\"\"\n random.seed(3)\n # Scenario: Testing with a single animal and a single food\n animals = [\"Dog\"]\n foods = [\"Meat\"]\n result = f_901(animals, foods)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (1, 1),\n \"The shape of the DataFrame with a single input is not as expected.\",\n )\n # Check if the pairs are correct\n self.assertIn(\n \"Dog:Meat\",\n result.values,\n \"The expected pair 'Dog:Meat' was not found in the resulting DataFrame.\",\n )\n def test_partial_default(self):\n \"\"\"Test with a custom list of animals and default list of foods.\"\"\"\n random.seed(4)\n # Scenario: Testing with a custom list of animals and default list of foods\n animals = [\"Dog\", \"Cat\", \"Elephant\"]\n result = f_901(animals)\n # Check the shape of the returned DataFrame\n self.assertEqual(\n result.shape,\n (3, 7),\n \"The shape of the DataFrame with partial default inputs is not as expected.\",\n )", "apis": ["pandas.DataFrame", "itertools.product", "numpy.array"], "libs": ["itertools", "numpy", "pandas"], "doc": {"description": ["Create a DataFrame with combinations of 
animals and foods in a 'animal:food' format.", "Handling of Special Cases:", "- If both 'animals' and 'foods' lists are empty or not provided, the function returns an empty DataFrame.", "- If either 'animals' or 'foods' list is empty or not provided, the function uses its predefined list for the missing parameter."], "note": ["The function generates all possible combinations of the provided 'animals' and 'foods' using itertools.product.", "The resulting pairs are shuffled randomly to ensure variety in the DataFrame layout."], "params": ["animals (list of str, optional): A list of animal names. If not provided,", "defaults to a predefined list of common animals including 'Dog', 'Cat', 'Elephant', 'Tiger', 'Lion', 'Zebra', 'Giraffe', 'Bear', 'Monkey', 'Kangaroo'.", "foods (list of str, optional): A list of food names. If not provided,", "defaults to a predefined list of common foods including 'Meat', 'Fish', 'Grass', 'Fruits', 'Insects', 'Seeds', 'Leaves'."], "returns": ["df (pandas.DataFrame): A DataFrame where each row represents a unique animal from the 'animals'", "list and each column represents a food item from the 'foods' list. Each cell contains a string in the format 'animal:food'."], "reqs": ["pandas", "numpy", "itertools"], "raises": [], "example": [">>> animal_food_pairs = f_901(['Dog', 'Cat'], ['Meat', 'Fish'])", ">>> print(animal_food_pairs)", "Meat Fish", "0 Dog:Meat Dog:Fish", "1 Cat:Meat Cat:Fish"]}} +{"task_id": "f_881", "prompt": "import pandas as pd\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\n\n\ndef f_881(s1, s2, n_clusters=3):\n \"\"\"\n Perform K-Means clustering on data points from two pandas Series and visualize the clusters.\n\n Parameters:\n - s1 (pandas.Series): The first series of data. Each value in the series represents a data point's coordinate along one dimension.\n - s2 (pandas.Series): The second series of data. Each value corresponds to a data point's coordinate along another dimension. 
The length of s2 must match that of s1.\n - n_clusters (int, optional): The number of clusters to form as well as the number of centroids to generate. Defaults to 3.\n\n Returns:\n - tuple: A tuple containing the following elements:\n - ndarray: An array of cluster labels indicating the cluster each data point belongs to.\n - matplotlib.axes.Axes: The Axes object of the plot, which shows the data points colored according to their cluster labels.\n\n Raises:\n - ValueError: If either s1 or s2 is not a pandas Series, raise \"s1 and s2 must be pandas Series\"\n - ValueError: If s1 and s2 have different lengths, raise \"s1 and s2 must have the same length\"\n\n Requirements:\n - pandas\n - scikit-learn\n - matplotlib\n\n Notes:\n - The function needs to ensure that s1 and s2 are pandas Series of equal length. \n - It then performs K-Means clustering on the combined data points from s1 and s2. \n - After clustering, it creates a scatter plot where each cluster is visualized with a different color. \n - The plot title is set to \"K-Means Clustering\" to describe the visualization technique. 
\n - A legend is added, which uses elements from the scatter plot to describe each cluster.\n \n Example:\n >>> s1 = pd.Series(np.random.rand(100), name='feature1')\n >>> s2 = pd.Series(np.random.rand(100), name='feature2')\n >>> labels, ax = f_881(s1, s2, n_clusters=4)\n >>> print(ax.get_title())\n K-Means Clustering\n\n \n \"\"\"", "canonical_solution": " if not isinstance(s1, pd.Series) or not isinstance(s2, pd.Series):\n raise ValueError(\"s1 and s2 must be pandas Series\")\n\n if len(s1) != len(s2):\n raise ValueError(\"s1 and s2 must have the same length\")\n\n # Create a DataFrame from the series\n df = pd.concat([s1, s2], axis=1)\n\n # Perform K-Means clustering\n kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)\n labels = kmeans.fit_predict(df)\n\n # Visualize the clusters\n _, ax = plt.subplots()\n scatter = ax.scatter(df[s1.name], df[s2.name], c=labels)\n ax.set_xlabel(s1.name)\n ax.set_ylabel(s2.name)\n ax.set_title(\"K-Means Clustering\")\n plt.legend(*scatter.legend_elements(), title=\"Clusters\")\n\n return labels, ax", "test": "import pandas as pd\nimport numpy as np\nimport unittest\nimport os\nfrom sklearn.datasets import make_blobs\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_881.\"\"\"\n def setUp(self) -> None:\n os.environ[\"LOKY_MAX_CPU_COUNT\"] = \"2\"\n def test_random_data_size_100(self):\n \"\"\"Test with random data of size 100 and default number of clusters\"\"\"\n np.random.seed(42)\n s1 = pd.Series(np.random.rand(100), name=\"feature1\")\n np.random.seed(0)\n s2 = pd.Series(np.random.rand(100), name=\"feature2\")\n labels, ax = f_881(s1, s2)\n # Check if labels are ndarray\n self.assertIsInstance(labels, np.ndarray)\n # Check the plot's title\n self.assertEqual(ax.get_title(), \"K-Means Clustering\")\n def test_random_data_custom_clusters(self):\n \"\"\"Test with random data of size 100 and custom number of clusters\"\"\"\n np.random.seed(42)\n s1 = pd.Series(np.random.rand(100), name=\"feature1\")\n 
np.random.seed(0)\n s2 = pd.Series(np.random.rand(100), name=\"feature2\")\n labels, ax = f_881(s1, s2, n_clusters=5)\n # Check if labels are ndarray\n self.assertIsInstance(labels, np.ndarray)\n self.assertEqual(len(set(labels)), 5)\n # Check the plot's title\n self.assertEqual(ax.get_title(), \"K-Means Clustering\")\n def test_invalid_input_non_series(self):\n \"\"\"Test with invalid input types (non-Series)\"\"\"\n with self.assertRaises(ValueError):\n f_881([1, 2, 3], pd.Series([4, 5, 6]))\n def test_invalid_input_mismatched_length(self):\n \"\"\"Test with mismatched length of Series\"\"\"\n s1 = pd.Series([1, 2, 3], name=\"feature1\")\n s2 = pd.Series([4, 5], name=\"feature2\")\n with self.assertRaises(ValueError):\n f_881(s1, s2)\n def test_custom_clusters_with_synthetic_data(self):\n \"\"\"Test with synthetic data and custom number of clusters using make_blobs\"\"\"\n # Generate synthetic data with 2 distinct clusters\n X, _ = make_blobs(n_samples=100, centers=2, random_state=42)\n # Convert to pandas Series\n s1 = pd.Series(X[:, 0], name=\"feature1\")\n s2 = pd.Series(X[:, 1], name=\"feature2\")\n # Run the clustering function\n labels, ax = f_881(s1, s2, n_clusters=2)\n # Check if labels are ndarray\n self.assertIsInstance(labels, np.ndarray)\n # Check the number of unique labels (should be 2 for 2 clusters)\n self.assertEqual(len(set(labels)), 2)\n # Check the plot's title\n self.assertEqual(ax.get_title(), \"K-Means Clustering\")\n def tearDown(self):\n plt.clf()", "apis": ["matplotlib.pyplot.legend", "sklearn.cluster.KMeans", "pandas.Series", "matplotlib.pyplot.subplots", "pandas.concat"], "libs": ["sklearn", "pandas", "matplotlib"], "doc": {"description": ["Perform K-Means clustering on data points from two pandas Series and visualize the clusters.", "Notes:", "- The function needs to ensure that s1 and s2 are pandas Series of equal length.", "- It then performs K-Means clustering on the combined data points from s1 and s2.", "- After clustering, it 
creates a scatter plot where each cluster is visualized with a different color.", "- The plot title is set to \"K-Means Clustering\" to describe the visualization technique.", "- A legend is added, which uses elements from the scatter plot to describe each cluster."], "note": [], "params": ["s1 (pandas.Series): The first series of data. Each value in the series represents a data point's coordinate along one dimension.", "s2 (pandas.Series): The second series of data. Each value corresponds to a data point's coordinate along another dimension. The length of s2 must match that of s1.", "n_clusters (int, optional): The number of clusters to form as well as the number of centroids to generate. Defaults to 3."], "returns": ["tuple: A tuple containing the following elements:", "ndarray: An array of cluster labels indicating the cluster each data point belongs to.", "matplotlib.axes.Axes: The Axes object of the plot, which shows the data points colored according to their cluster labels."], "reqs": ["pandas", "scikit-learn", "matplotlib"], "raises": ["ValueError: If either s1 or s2 is not a pandas Series, raise \"s1 and s2 must be pandas Series\"", "ValueError: If s1 and s2 have different lengths, raise \"s1 and s2 must have the same length\""], "example": [">>> s1 = pd.Series(np.random.rand(100), name='feature1')", ">>> s2 = pd.Series(np.random.rand(100), name='feature2')", ">>> labels, ax = f_881(s1, s2, n_clusters=4)", ">>> print(ax.get_title())", "K-Means Clustering"]}} +{"task_id": "f_828", "prompt": "import pandas as pd\nimport json\nimport os\nimport math\n\n\ndef f_828(json_data, output_dir=\".\", file_name=\"country_population_report.csv\"):\n \"\"\"\n Generates a population report DataFrame and CSV file based on provided JSON data.\n\n Parameters:\n - json_data (str): Nested JSON string containing country names (str) as keys and\n populations (int) as values. 
The parent key is expected to be \"Countries\".\n Example format:\n '{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'.\n - output_dir (str): Directory path where the CSV report will be saved.\n Defaults to the current directory.\n The function will create it if it does not exist.\n - file_name (str): Name of the CSV report. Defaults to \"country_population_report.csv\".\n\n Returns:\n - str: The file path of the generated CSV report.\n - pd.DataFrame: The country-population data loaded from the input JSON, with columns:\n \"Country\", \"Population\".\n\n Raises:\n - ValueError: If the JSON data is malformed, empty, contains non-string country names,\n non-numeric or negative populations.\n - IOError: If the file cannot be written to the specified directory.\n\n Requirements:\n - json\n - os\n - pandas\n - math\n\n Notes:\n - Output DataFrame has no extra index column.\n - If this function encounters a float population that is otherwise valid, it will round it\n down to the nearest integer.\n\n Example:\n >>> json_str = '{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'\n >>> csv_file_path, df = f_828(json_str)\n >>> print(csv_file_path)\n ./country_population_report.csv\n >>> df\n Country Population\n 0 Country A 331002651\n 1 Country B 67886011\n \"\"\"", "canonical_solution": " os.makedirs(output_dir, exist_ok=True)\n file_path = os.path.join(output_dir, file_name)\n\n try:\n data = json.loads(json_data)\n except json.JSONDecodeError:\n raise ValueError(\"Invalid JSON data provided.\")\n\n country_data_dict = data.get(\"Countries\")\n\n if country_data_dict is None:\n raise ValueError(\"No valid country population data found in JSON.\")\n\n for country, population in country_data_dict.items():\n if not isinstance(country, str):\n raise ValueError(f\"Country name must be a string. 
Invalid entry: {country}\")\n if not isinstance(population, int):\n if isinstance(population, float):\n country_data_dict[country] = math.floor(population)\n else:\n raise ValueError(\n f\"Population must be an integer. Invalid entry for {country}: {population}\"\n )\n if population < 0:\n raise ValueError(\"Population cannot be negative.\")\n\n country_data = [\n [country, population] for country, population in country_data_dict.items()\n ]\n df = pd.DataFrame(country_data, columns=[\"Country\", \"Population\"])\n\n try:\n df.to_csv(file_path, index=False)\n except IOError as e:\n raise IOError(f\"Failed to write the CSV file to {output_dir}: {e}\")\n\n return file_path, df", "test": "import unittest\nimport os\nimport json\nimport pandas as pd\nimport tempfile\nclass TestCases(unittest.TestCase):\n def setUp(self):\n self.temp_dir = tempfile.TemporaryDirectory()\n self.output_dir = self.temp_dir.name\n def tearDown(self):\n self.temp_dir.cleanup()\n def check_df_format(self, df):\n self.assertIsInstance(df, pd.DataFrame)\n self.assertTrue(\"Country\" in df.columns)\n self.assertTrue(\"Population\" in df.columns)\n def test_case_1(self):\n # Test basic case\n json_data = '{\"Countries\": {\"USA\": 331002651, \"UK\": 67886011}}'\n csv_file, df1 = f_828(json_data, self.output_dir)\n self.check_df_format(df1)\n self.assertTrue(os.path.exists(csv_file))\n df2 = pd.read_csv(csv_file)\n self.check_df_format(df2)\n pd.testing.assert_frame_equal(df1, df2)\n self.assertTrue(df1.shape[0] == 2)\n self.assertEqual(df1.loc[df1.Country == \"USA\", \"Population\"].item(), 331002651)\n self.assertEqual(df1.loc[df1.Country == \"UK\", \"Population\"].item(), 67886011)\n def test_case_2(self):\n # Test with empty json\n json_data = \"{}\"\n with self.assertRaises(ValueError):\n f_828(json_data, self.output_dir)\n def test_case_3(self):\n # Test incorrect JSON format\n with self.assertRaises(ValueError):\n f_828('{\"WRONG\": {\"USA\": 331002651, \"UK\": 67886011}}', 
self.output_dir)\n with self.assertRaises(ValueError):\n f_828('{\"USA\": 331002651, \"UK\": 67886011}', self.output_dir)\n with self.assertRaises(ValueError):\n f_828('{\"Countries\": {\"USA\": 331002651, \"UK\"', self.output_dir)\n def test_case_4(self):\n # Test that output directory is created if it does not exist\n non_existing_dir = os.path.join(self.output_dir, \"new_directory\")\n self.assertFalse(\n os.path.exists(non_existing_dir), \"Directory already exists before test.\"\n )\n json_data = '{\"Countries\": {\"Country A\": 1000}}'\n _, _ = f_828(json_data, non_existing_dir)\n self.assertTrue(\n os.path.exists(non_existing_dir),\n \"Directory was not created by the function.\",\n )\n def test_case_5(self):\n # Test with country names that include special characters\n json_data = '{\"Countries\": {\"C\u00f4te d\\'Ivoire\": 26378274, \"S\u00e3o Tom\u00e9 and Pr\u00edncipe\": 219159}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertTrue(\"C\u00f4te d'Ivoire\" in df.Country.values)\n self.assertTrue(\"S\u00e3o Tom\u00e9 and Pr\u00edncipe\" in df.Country.values)\n def test_case_6(self):\n # Test with empty \"Countries\" object\n json_data = '{\"Countries\": {}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertTrue(df.empty)\n def test_case_7(self):\n # Test with non-numeric/negative population values\n with self.assertRaises(ValueError):\n f_828(\n '{\"Countries\": {\"Country X\": \"1000000\", \"Country Y\": null}}',\n self.output_dir,\n )\n with self.assertRaises(ValueError):\n f_828(\n '{\"Countries\": {\"Country X\": \"1000000\", \"Country Y\": \"ABC\"}}',\n self.output_dir,\n )\n with self.assertRaises(ValueError):\n f_828(\n '{\"Countries\": {\"Country X\": \"1000000\", \"Country Y\": -1}}',\n self.output_dir,\n )\n def test_case_8(self):\n # Test handling zero population\n json_data 
= '{\"Countries\": {\"Uninhabited Island\": 0}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertTrue(\"Uninhabited Island\" in df.Country.values)\n self.assertEqual(\n df.loc[df.Country == \"Uninhabited Island\", \"Population\"].item(), 0\n )\n def test_case_9(self):\n # Test handling valid floats - should be correctly rounded\n json_data = '{\"Countries\": {\"Country Float Pop\": 1234567.89, \"Another Country\": 98765.432}}'\n csv_file, df = f_828(json_data, self.output_dir)\n self.check_df_format(df)\n self.assertTrue(os.path.exists(csv_file))\n self.assertEqual(\n df.loc[df.Country == \"Country Float Pop\", \"Population\"].item(), 1234567\n )\n self.assertEqual(\n df.loc[df.Country == \"Another Country\", \"Population\"].item(), 98765\n )", "apis": ["json.loads", "math.floor", "pandas.DataFrame", "os.makedirs", "os.path", "json.JSONDecodeError", "os.path.join"], "libs": ["os", "json", "pandas", "math"], "doc": {"description": ["Generates a population report DataFrame and CSV file based on provided JSON data.", "Notes:", "- Output DataFrame has no extra index column.", "- If this function encounters a float population that is otherwise valid, it will round it", "down to the nearest integer."], "note": [], "params": ["json_data (str): Nested JSON string containing country names (str) as keys and", "populations (int) as values. The parent key is expected to be \"Countries\".", "Example format:", "'{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'.", "output_dir (str): Directory path where the CSV report will be saved.", "Defaults to the current directory.", "The function will create it if it does not exist.", "file_name (str): Name of the CSV report. 
Defaults to \"country_population_report.csv\"."], "returns": ["str: The file path of the generated CSV report.", "pd.DataFrame: The country-population data loaded from the input JSON, with columns:", "\"Country\", \"Population\"."], "reqs": ["json", "os", "pandas", "math"], "raises": ["ValueError: If the JSON data is malformed, empty, contains non-string country names,", "non-numeric or negative populations.", "IOError: If the file cannot be written to the specified directory."], "example": [">>> json_str = '{\"Countries\": {\"Country A\": 331002651, \"Country B\": 67886011}}'", ">>> csv_file_path, df = f_828(json_str)", ">>> print(csv_file_path)", "./country_population_report.csv", ">>> df", "Country Population", "0 Country A 331002651", "1 Country B 67886011"]}} +{"task_id": "f_902", "prompt": "import itertools\nimport random\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# Constants\nSHAPES = [\n \"Circle\",\n \"Square\",\n \"Triangle\",\n \"Rectangle\",\n \"Pentagon\",\n \"Hexagon\",\n \"Heptagon\",\n \"Octagon\",\n \"Nonagon\",\n \"Decagon\",\n]\nCOLORS = [\n \"Red\",\n \"Blue\",\n \"Green\",\n \"Yellow\",\n \"Black\",\n \"White\",\n \"Purple\",\n \"Orange\",\n \"Pink\",\n \"Brown\",\n]\n\n\ndef f_902(num_pairs=10):\n \"\"\"\n Generate and display a countplot of predefined shape-color pairs.\n\n This function creates a visual representation of a specified number of unique shape-color combinations,\n each displayed as a bar in the countplot. The shape-color pairs are selected from a predefined list.\n\n Parameters:\n - num_pairs (int): The number of unique shape-color pairs to be displayed in the countplot.\n Default is 10. 
If the requested number is less than 1 or greater than the total\n possible unique combinations (100), it is adjusted to the valid range (1 to 100).\n\n Returns:\n - ax (matplotlib.axes._subplots.Axes): The Axes object of the countplot, which can be used for\n further customizations or to retrieve information about the plot.\n\n Requirements:\n - itertools\n - seaborn\n - matplotlib\n\n Example:\n >>> ax = f_902(10)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(9)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(8)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(7)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n >>> ax = f_902(6)\n >>> [tick.get_text() for tick in ax.get_xticklabels()]\n ['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']\n \"\"\"", "canonical_solution": " max_pairs = len(SHAPES) * len(COLORS)\n num_pairs = min(num_pairs, max_pairs)\n \n pairs = [f\"{s}:{c}\" for s, c in itertools.product(SHAPES, COLORS)][:num_pairs]\n \n # Drawing the countplot\n ax = sns.countplot(x=pairs, hue=pairs, palette=\"Set3\", legend=False)\n plt.xticks(rotation=90)\n \n return ax", "test": "import unittest\nimport matplotlib.pyplot as 
plt\nimport random\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for f_902.\"\"\"\n def tearDown(self):\n plt.clf()\n def test_basic_functionality(self):\n \"\"\"Test basic functionality with default parameters.\"\"\"\n random.seed(0)\n ax = f_902()\n self.assertIsInstance(ax, plt.Axes)\n def test_pair_count(self):\n \"\"\"Test if the number of displayed shape-color pairs matches the input.\"\"\"\n random.seed(1)\n num_pairs = 7\n ax = f_902(num_pairs)\n displayed_pairs = len(set(tick.get_text() for tick in ax.get_xticklabels()))\n self.assertEqual(displayed_pairs, num_pairs)\n def test_valid_pairs(self):\n \"\"\"Ensure displayed shape-color pairs are valid combinations.\"\"\"\n random.seed(2)\n ax = f_902(10)\n displayed_pairs = [tick.get_text() for tick in ax.get_xticklabels()]\n for pair in displayed_pairs:\n shape, color = pair.split(\":\")\n self.assertIn(shape, SHAPES)\n self.assertIn(color, COLORS)\n def test_max_pairs(self):\n \"\"\"Test with the maximum number of pairs possible.\"\"\"\n random.seed(3)\n max_pairs = len(SHAPES) * len(COLORS)\n ax = f_902(max_pairs)\n displayed_pairs = len(set(tick.get_text() for tick in ax.get_xticklabels()))\n self.assertEqual(displayed_pairs, max_pairs)\n def test_min_pairs(self):\n \"\"\"Test with the minimum number of pairs, which is 1.\"\"\"\n random.seed(4)\n ax = f_902(1)\n displayed_pairs = len(set(tick.get_text() for tick in ax.get_xticklabels()))\n self.assertEqual(displayed_pairs, 1)", "apis": ["itertools.product", "seaborn.countplot", "matplotlib.pyplot.xticks"], "libs": ["itertools", "seaborn", "matplotlib"], "doc": {"description": ["Generate and display a countplot of predefined shape-color pairs.", "This function creates a visual representation of a specified number of unique shape-color combinations,", "each displayed as a bar in the countplot. 
The shape-color pairs are selected from a predefined list."], "note": [], "params": ["num_pairs (int): The number of unique shape-color pairs to be displayed in the countplot.", "Default is 10. If the requested number is less than 1 or greater than the total", "possible unique combinations (100), it is adjusted to the valid range (1 to 100)."], "returns": ["ax (matplotlib.axes._subplots.Axes): The Axes object of the countplot, which can be used for", "further customizations or to retrieve information about the plot."], "reqs": ["itertools", "seaborn", "matplotlib"], "raises": [], "example": [">>> ax = f_902(10)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(9)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(8)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(7)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']", ">>> ax = f_902(6)", ">>> [tick.get_text() for tick in ax.get_xticklabels()]", "['Circle:Red', 'Circle:Blue', 'Circle:Green', 'Circle:Yellow', 'Circle:Black', 'Circle:White', 'Circle:Purple', 'Circle:Orange', 'Circle:Pink', 'Circle:Brown']"]}} +{"task_id": "f_846", "prompt": "import urllib.request\nfrom lxml import etree\nimport pandas as pd\n\n\ndef f_846(url):\n \"\"\"\n Fetches and parses an XML file from a specified URL, 
then converts it into a Pandas DataFrame.\n\n Parameters:\n url (str): The URL of the CSV file to be downloaded. Must be a valid and accessible URL.\n \n Returns:\n pandas.DataFrame\n A DataFrame constructed from the parsed XML data. Each row of the DataFrame corresponds to an 'item' element\n in the XML file, with child elements of 'item' becoming columns in the DataFrame.\n\n Raises:\n ValueError\n This error is raised in several scenarios:\n 1. If the URL is invalid or the XML file cannot be fetched from the URL.\n 2. If the XML file has invalid syntax.\n 3. If the XML structure does not conform to the expected format.\n\n Requirements:\n - urllib\n - lxml\n - pandas\n\n Examples:\n # Example with a valid XML structure\n >>> df = f_846('http://example.com/sample_data.xml')\n >>> print(df)\n name age\n 0 John 25\n 1 Jane 30\n\n # Example with an invalid XML structure\n >>> df = f_846('http://example.com/invalid_structure.xml')\n ValueError: XML structure does not match expected format.\n \"\"\"", "canonical_solution": " try:\n with urllib.request.urlopen(url) as response:\n xml_data = response.read()\n except Exception as e:\n raise ValueError(f\"Error fetching the XML file: {e}\")\n\n try:\n xml_tree = etree.XML(xml_data)\n except etree.XMLSyntaxError:\n raise ValueError(\"Invalid XML syntax\")\n\n data = []\n for item in xml_tree.findall(\".//item\"):\n data_item = {child.tag: child.text for child in item}\n data.append(data_item)\n\n if not data:\n raise ValueError(\"XML structure does not match expected format.\")\n\n return pd.DataFrame(data)", "test": "import unittest\nimport pandas as pd\nfrom unittest.mock import patch\nclass TestCases(unittest.TestCase):\n \"\"\"Test cases for the f_846 function.\"\"\"\n @patch(\"urllib.request.urlopen\")\n def test_valid_xml(self, mock_urlopen):\n \"\"\"Test that the function returns the correct DataFrame for a given XML file.\"\"\"\n # Mocking the XML data\n valid_xml_data = b\"John25Jane30\"\n 
mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n valid_xml_data\n )\n url = \"http://example.com/sample_data.xml\"\n expected_df = pd.DataFrame({\"name\": [\"John\", \"Jane\"], \"age\": [\"25\", \"30\"]})\n result_df = f_846(url)\n pd.testing.assert_frame_equal(result_df, expected_df)\n @patch(\"urllib.request.urlopen\")\n def test_empty_xml(self, mock_urlopen):\n \"\"\"Test that the function raises an error for an empty XML file.\"\"\"\n # Mocking empty XML data\n empty_xml_data = b\"\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n empty_xml_data\n )\n url = \"http://example.com/empty_data.xml\"\n with self.assertRaises(ValueError):\n f_846(url)\n @patch(\"urllib.request.urlopen\")\n def test_different_structure_xml(self, mock_urlopen):\n \"\"\"Test that the function raises an error for an XML file with a different structure.\"\"\"\n # Mocking XML with different structure\n different_structure_xml = (\n b\"John\"\n )\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n different_structure_xml\n )\n url = \"http://example.com/different_structure_data.xml\"\n with self.assertRaises(ValueError):\n f_846(url)\n @patch(\"urllib.request.urlopen\")\n def test_invalid_url(self, mock_urlopen):\n \"\"\"Test that the function raises an error for an invalid URL.\"\"\"\n # Simulate an error in URL fetching\n mock_urlopen.side_effect = Exception(\"URL fetch error\")\n url = \"http://example.com/nonexistent/file.xml\"\n with self.assertRaises(ValueError):\n f_846(url)\n @patch(\"urllib.request.urlopen\")\n def test_non_xml_data(self, mock_urlopen):\n \"\"\"Test that the function raises an error for non-XML data.\"\"\"\n # Mocking non-XML data\n non_xml_data = b\"Not an XML content\"\n mock_urlopen.return_value.__enter__.return_value.read.return_value = (\n non_xml_data\n )\n url = \"http://example.com/non_xml_data.txt\"\n with self.assertRaises(ValueError):\n f_846(url)", "apis": ["urllib.request", 
"lxml.etree.XML", "pandas.DataFrame", "urllib.request.urlopen", "lxml.etree.XMLSyntaxError"], "libs": ["urllib", "pandas", "lxml"], "doc": {"description": ["Fetches and parses an XML file from a specified URL, then converts it into a Pandas DataFrame.", "# Example with an invalid XML structure", ">>> df = f_846('http://example.com/invalid_structure.xml')", "ValueError: XML structure does not match expected format."], "note": [], "params": ["url (str): The URL of the CSV file to be downloaded. Must be a valid and accessible URL."], "returns": ["pandas.DataFrame", "A DataFrame constructed from the parsed XML data. Each row of the DataFrame corresponds to an 'item' element", "in the XML file, with child elements of 'item' becoming columns in the DataFrame."], "reqs": ["urllib", "lxml", "pandas"], "raises": ["ValueError", "This error is raised in several scenarios:", "1. If the URL is invalid or the XML file cannot be fetched from the URL.", "2. If the XML file has invalid syntax.", "3. If the XML structure does not conform to the expected format."], "example": ["Examples:", "# Example with a valid XML structure", ">>> df = f_846('http://example.com/sample_data.xml')", ">>> print(df)", "name age", "0 John 25", "1 Jane 30"]}} +{"task_id": "f_581", "prompt": "import pandas as pd\nfrom sklearn.model_selection import train_test_split\n\n\ndef f_581(df):\n \"\"\"\n Divide the given DataFrame into a training set and a test set (70%: 30% split), separate the \"target\" column and return the four resulting DataFrames.\n\n Parameters:\n - df (pd.DataFrame): pandas DataFrame that contains a column named 'target'.\n\n Returns:\n - tuple: A tuple containing four DataFrames: X_train, X_test, y_train, y_test.\n\n Requirements:\n - pandas\n - sklearn\n \n Example:\n >>> np.random.seed(42) # Ensure reproducibility\n >>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 5)), columns=list('ABCDE')) # Explicitly using np and pd\n >>> df['target'] = np.random.randint(0, 2, size=100) 
# Adding 'target' column using np\n >>> X_train, X_test, y_train, y_test = f_581(df)\n >>> print(X_train.shape) # Expected shape of training data\n (70, 5)\n \"\"\"", "canonical_solution": " X = pd.DataFrame.drop(df, 'target', axis=1)\n y = pd.DataFrame(df['target'])\n\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n\n return X_train, X_test, y_train, y_test", "test": "import unittest\nimport numpy as np\nclass TestCases(unittest.TestCase):\n def test_case_1(self):\n df = pd.DataFrame(np.random.randint(0, 100, size=(100, 5)), columns=list('ABCDE'))\n df['target'] = np.random.randint(0, 2, size=100)\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (70, 5))\n self.assertEqual(X_test.shape, (30, 5))\n self.assertEqual(y_train.shape, (70, 1))\n self.assertEqual(y_test.shape, (30, 1))\n def test_case_2(self):\n df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'target': [0, 1, 0]})\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))\n def test_case_3(self):\n df = pd.DataFrame({'A': [0, 0, 0], 'B': [0, 0, 0], 'target': [0, 0, 0]})\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))\n self.assertEqual(X_train.iloc[0, 0], 0)\n self.assertEqual(X_train.iloc[0, 1], 0)\n self.assertEqual(X_train.iloc[1, 0], 0)\n self.assertEqual(X_train.iloc[1, 1], 0)\n self.assertEqual(X_test.iloc[0, 0], 0)\n self.assertEqual(X_test.iloc[0, 1], 0)\n self.assertEqual(y_train.iloc[0].to_list(), [0])\n self.assertEqual(y_train.iloc[1].to_list(), [0])\n self.assertEqual(y_test.iloc[0].to_list(), [0])\n def test_case_4(self):\n df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'target': [1, 1, 1]})\n 
X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))\n \n def test_case_5(self):\n df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'target': [0, 0, 0]})\n X_train, X_test, y_train, y_test = f_581(df)\n self.assertEqual(X_train.shape, (2, 2))\n self.assertEqual(X_test.shape, (1, 2))\n self.assertEqual(y_train.shape, (2, 1))\n self.assertEqual(y_test.shape, (1, 1))", "apis": ["pandas.DataFrame", "sklearn.model_selection.train_test_split", "pandas.DataFrame.drop"], "libs": ["pandas", "sklearn"], "doc": {"description": ["Divide the given DataFrame into a training set and a test set (70%: 30% split), separate the \"target\" column and return the four resulting DataFrames."], "note": [], "params": ["df (pd.DataFrame): pandas DataFrame that contains a column named 'target'."], "returns": ["tuple: A tuple containing four DataFrames: X_train, X_test, y_train, y_test."], "reqs": ["pandas", "sklearn"], "raises": [], "example": [">>> np.random.seed(42) # Ensure reproducibility", ">>> df = pd.DataFrame(np.random.randint(0, 100, size=(100, 5)), columns=list('ABCDE')) # Explicitly using np and pd", ">>> df['target'] = np.random.randint(0, 2, size=100) # Adding 'target' column using np", ">>> X_train, X_test, y_train, y_test = f_581(df)", ">>> print(X_train.shape) # Expected shape of training data", "(70, 5)"]}} +{"task_id": "f_877", "prompt": "import pandas as pd\nimport numpy as np\n\n\nCATEGORIES = [\"Electronics\", \"Clothing\", \"Home Decor\", \"Automotive\", \"Books\"]\n\n\ndef f_877(s1, s2):\n \"\"\"\n Compares and visualizes the sales data of two stores for predefined categories.\n The function generates a bar plot for categories where both stores have sales exceeding a specified threshold.\n The Euclidean distance between the two series is also computed.\n \n Parameters:\n s1 (pd.Series): Sales data for store 1, 
indexed by categories.\n s2 (pd.Series): Sales data for store 2, indexed by categories.\n\n Returns:\n matplotlib.axes.Axes or None: A bar plot for categories where both stores' sales exceed the threshold of 200,\n or None if no such categories exist.\n float: The Euclidean distance between the two series or 0.0 if no categories meet the threshold.\n\n Requirements:\n - pandas\n - numpy\n\n Example:\n >>> np.random.seed(seed=32)\n >>> s1 = pd.Series(np.random.randint(100, 500, size=5), index=CATEGORIES)\n >>> s2 = pd.Series(np.random.randint(150, 600, size=5), index=CATEGORIES)\n >>> ax, edit_distance = f_877(s1, s2)\n >>> ax.get_title()\n 'Sales Comparison Above Threshold in Categories'\n >>> edit_distance\n 387.5590277622236\n \"\"\"", "canonical_solution": "\n # Determine categories where both stores exceed the sales threshold\n high_sales_categories = s1.index[(s1 > 200) & (s2 > 200)]\n\n if high_sales_categories.empty:\n return None, 0.0\n\n # Prepare the data for plotting\n df = pd.DataFrame(\n {\"Store 1\": s1[high_sales_categories], \"Store 2\": s2[high_sales_categories]}\n )\n\n # compute the edit distance between the two series\n edit_distance = np.linalg.norm(df[\"Store 1\"] - df[\"Store 2\"])\n \n # Generate the bar plot\n ax = df.plot(kind=\"bar\", title=\"Sales Comparison Above Threshold in Categories\")\n return ax, edit_distance", "test": "import pandas as pd\nimport numpy as np\nimport unittest\nimport matplotlib.pyplot as plt\n# Constants (should be kept consistent with function.py)\nCATEGORIES = [\"Electronics\", \"Clothing\", \"Home Decor\", \"Automotive\", \"Books\"]\nclass TestCases(unittest.TestCase):\n \"\"\"Tests for function f_877.\"\"\"\n def test_sales_above_threshold(self):\n \"\"\"Test that the function returns a plot when sales exceed the threshold\"\"\"\n np.random.seed(seed=32)\n s1 = pd.Series(np.random.randint(100, 500, size=5), index=CATEGORIES)\n np.random.seed(seed=32)\n s2 = pd.Series(np.random.randint(150, 600, size=5), 
index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check the correct categories are plotted\n categories_plotted = [label.get_text() for label in ax.get_xticklabels()]\n self.assertListEqual(\n categories_plotted, [\"Electronics\", \"Home Decor\", \"Automotive\", \"Books\"]\n )\n # Check the title of the plot\n self.assertEqual(\n ax.get_title(), \"Sales Comparison Above Threshold in Categories\"\n )\n self.assertAlmostEqual(edit_distance, 100.0)\n \n def test_no_sales_above_threshold(self):\n \"\"\"Test that no categories are plotted when no sales exceed the threshold\"\"\"\n np.random.seed(seed=32)\n s1 = pd.Series(np.random.randint(50, 150, size=5), index=CATEGORIES)\n np.random.seed(seed=32)\n s2 = pd.Series(np.random.randint(50, 150, size=5), index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that no categories are plotted\n self.assertIsNone(\n ax, \"Expected None as no categories should meet the threshold\"\n )\n self.assertAlmostEqual(edit_distance, 0.0)\n def test_all_sales_above_threshold(self):\n \"\"\"Test that all categories are plotted when all sales exceed the threshold\"\"\"\n np.random.seed(seed=123)\n s1 = pd.Series(np.random.randint(200, 500, size=5), index=CATEGORIES)\n np.random.seed(seed=123)\n s2 = pd.Series(np.random.randint(250, 600, size=5), index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that all categories are plotted\n categories_plotted = [label.get_text() for label in ax.get_xticklabels()]\n self.assertListEqual(categories_plotted, CATEGORIES)\n self.assertAlmostEqual(edit_distance, 389.8127755730948)\n \n def test_some_sales_above_threshold(self):\n \"\"\"Test that some categories are plotted when some sales exceed the threshold\"\"\"\n s1 = pd.Series([250, 180, 290, 200, 290], index=CATEGORIES)\n s2 = pd.Series([260, 290, 195, 299, 295], index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that only the correct categories are plotted\n categories_plotted = [label.get_text() for label in 
ax.get_xticklabels()]\n self.assertListEqual(categories_plotted, [\"Electronics\", \"Books\"])\n self.assertAlmostEqual(edit_distance, 11.180339887498949)\n \n def test_single_sales_above_threshold(self):\n \"\"\"Test that only a single category is plotted when only a single category has sales exceeding the threshold\"\"\"\n s1 = pd.Series([150, 180, 290, 200, 190], index=CATEGORIES)\n s2 = pd.Series([160, 190, 295, 199, 195], index=CATEGORIES)\n ax, edit_distance = f_877(s1, s2)\n # Check that only a single category is plotted\n categories_plotted = [label.get_text() for label in ax.get_xticklabels()]\n self.assertListEqual(categories_plotted, [\"Home Decor\"])\n self.assertAlmostEqual(edit_distance, 5.0)\n \n def tearDown(self):\n plt.close()", "apis": ["pandas.DataFrame", "numpy.linalg", "numpy.linalg.norm"], "libs": ["numpy", "pandas"], "doc": {"description": ["Compares and visualizes the sales data of two stores for predefined categories.", "The function generates a bar plot for categories where both stores have sales exceeding a specified threshold.", "The Euclidean distance between the two series is also computed."], "note": [], "params": ["s1 (pd.Series): Sales data for store 1, indexed by categories.", "s2 (pd.Series): Sales data for store 2, indexed by categories."], "returns": ["matplotlib.axes.Axes or None: A bar plot for categories where both stores' sales exceed the threshold of 200,", "or None if no such categories exist.", "float: The Euclidean distance between the two series or 0.0 if no categories meet the threshold."], "reqs": ["pandas", "numpy"], "raises": [], "example": [">>> np.random.seed(seed=32)", ">>> s1 = pd.Series(np.random.randint(100, 500, size=5), index=CATEGORIES)", ">>> s2 = pd.Series(np.random.randint(150, 600, size=5), index=CATEGORIES)", ">>> ax, edit_distance = f_877(s1, s2)", ">>> ax.get_title()", "'Sales Comparison Above Threshold in Categories'", ">>> edit_distance", "387.5590277622236"]}} diff --git 
a/data/processed/f_331_jenny_w_doc.py b/data/processed/f_331_jenny_w_doc.py index 7752158f..f7cb8f61 100644 --- a/data/processed/f_331_jenny_w_doc.py +++ b/data/processed/f_331_jenny_w_doc.py @@ -14,7 +14,7 @@ def f_331(data, column="c"): - column (str): Name of column to remove. Defaults to "c". Returns: - - matplotlib.axes._subplots.AxesSubplot or None: The Axes object of the heatmap + - matplotlib.axes._subplots.Axes or None: The Axes object of the heatmap or None if the heatmap is not generated. Requirements: @@ -23,7 +23,7 @@ def f_331(data, column="c"): Example: >>> f_331({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - + >>> f_331(pd.DataFrame({'a': ["foo", "bar"]})) """ df = pd.DataFrame(data) diff --git a/data/processed/f_336_jenny_w_doc.py b/data/processed/f_336_jenny_w_doc.py index 82914a2e..16c5f894 100644 --- a/data/processed/f_336_jenny_w_doc.py +++ b/data/processed/f_336_jenny_w_doc.py @@ -14,7 +14,7 @@ def f_336(df1, df2): Returns: - tuple: A tuple containing: - list: A list of the selected features. - - AxesSubplot: A heatmap showing the correlation between the selected features. + - Axes: A heatmap showing the correlation between the selected features. Requirements: - pandas diff --git a/data/processed/f_367_jenny_wo_doc.py b/data/processed/f_367_jenny_wo_doc.py index 7fd74a5c..277a5a30 100644 --- a/data/processed/f_367_jenny_wo_doc.py +++ b/data/processed/f_367_jenny_wo_doc.py @@ -29,7 +29,7 @@ def f_367(file_path="data.csv", columns=["A", "B", "C"]): 0 1.0 2.0 3.0 1 4.0 5.0 6.0 >>> ax - + >>> croot 0 1.0 """ diff --git a/data/processed/f_407_jenny_wo_doc.py b/data/processed/f_407_jenny_wo_doc.py index e6d5b187..6c7b1983 100644 --- a/data/processed/f_407_jenny_wo_doc.py +++ b/data/processed/f_407_jenny_wo_doc.py @@ -14,7 +14,7 @@ def f_407(data): are not the expected type, this function raises TypeError. Returns: - - matplotlib.axes._subplots.AxesSubplot: The generated plot's Axes object. 
+ - matplotlib.axes._subplots.Axes: The generated plot's Axes object. Requirements: - pandas diff --git a/data/processed/f_411_jenny_w_doc.py b/data/processed/f_411_jenny_w_doc.py index 5c5ecc0b..341efd90 100644 --- a/data/processed/f_411_jenny_w_doc.py +++ b/data/processed/f_411_jenny_w_doc.py @@ -10,7 +10,7 @@ def f_411(data): data (list): A list of dictionaries. The keys are labels and the values are data points. Returns: - matplotlib.axes._subplots.AxesSubplot or None: Axes object of the plot showing 'Data over Time', + matplotlib.axes._subplots.Axes or None: Axes object of the plot showing 'Data over Time', with 'Time' on the x-axis and 'Data Points' on the y-axis. If data is empty, return None. diff --git a/data/processed/f_413_jenny_wo_doc.py b/data/processed/f_413_jenny_wo_doc.py index 4266eb3f..0e83d5c2 100644 --- a/data/processed/f_413_jenny_wo_doc.py +++ b/data/processed/f_413_jenny_wo_doc.py @@ -16,7 +16,7 @@ def f_413(input_file): Returns: - result (dict): each key corresponds to those in the input dictionaries, and the corresponding value is another dict with keys 'mean' and 'median', representing the calculated statistics. - - plots (list[matplotlib.axes._subplots.AxesSubplot]): A list of bar charts, one for + - plots (list[matplotlib.axes._subplots.Axes]): A list of bar charts, one for each key in the dictionaries, visualizing the mean and median values. Requirements: diff --git a/data/processed/f_423_jenny_wo_doc.py b/data/processed/f_423_jenny_wo_doc.py index bd439964..f6ba6341 100644 --- a/data/processed/f_423_jenny_wo_doc.py +++ b/data/processed/f_423_jenny_wo_doc.py @@ -13,7 +13,7 @@ def f_423(db_name="test.db", table_name="People"): table_name (str, optional): The name of the table to plot from. Defaults to 'People'. 
Returns: - matplotlib.axes._subplots.AxesSubplot: Axes object representing the age distribution plot, + matplotlib.axes._subplots.Axes: Axes object representing the age distribution plot, with x-axis showing age and a default of bins=30, kde=True. Requirements: diff --git a/data/processed/f_746_wenhao_w_doc.py b/data/processed/f_746_wenhao_w_doc.py index 5e6ec580..6dc59955 100644 --- a/data/processed/f_746_wenhao_w_doc.py +++ b/data/processed/f_746_wenhao_w_doc.py @@ -20,11 +20,11 @@ def f_746(d, keys=['x', 'y', 'z']): >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}] >>> ax = f_746(data) >>> type(ax) - + >>> ax = f_746(data, keys=['x', 'y']) >>> type(ax) - + """ # Convert the list of dictionaries to a DataFrame df = pd.DataFrame(d) diff --git a/data/processed/f_750_wenhao_wo_doc.py b/data/processed/f_750_wenhao_wo_doc.py index db0d66eb..1303b1e7 100644 --- a/data/processed/f_750_wenhao_wo_doc.py +++ b/data/processed/f_750_wenhao_wo_doc.py @@ -17,7 +17,7 @@ def f_750(directory: str, pattern: str) -> list: - pattern (str): The regular expression pattern to match the filenames. Returns: - - A list of matplotlib.axes._subplots.AxesSubplot objects, each representing a plot of sales data from a matched CSV file. + - A list of matplotlib.axes._subplots.Axes objects, each representing a plot of sales data from a matched CSV file. 
Example usage: >>> axes = f_750('/path/to/data/', r'^sales_data_\d{4}.csv') diff --git a/data/processed/f_752_wenhao_w_doc.py b/data/processed/f_752_wenhao_w_doc.py index 6645a529..1376d7a5 100644 --- a/data/processed/f_752_wenhao_w_doc.py +++ b/data/processed/f_752_wenhao_w_doc.py @@ -31,7 +31,7 @@ def f_752(letters, repetitions, colors): Example: >>> ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue']) >>> type(ax) - + """ if len(letters) != len(repetitions) or len(letters) != len(colors) or len(letters) == 0: raise ValueError("All lists must be the same length and non-empty.") diff --git a/data/processed/f_757_wenhao_w_doc.py b/data/processed/f_757_wenhao_w_doc.py index e3231ced..152beea4 100644 --- a/data/processed/f_757_wenhao_w_doc.py +++ b/data/processed/f_757_wenhao_w_doc.py @@ -14,7 +14,7 @@ def f_757(df, z_threshold=2): Returns: tuple: A tuple containing the following elements: - pandas.DataFrame: A DataFrame containing the outliers in the 'closing_price' column. - - matplotlib.axes._subplots.AxesSubplot: The plot object displaying the outliers. + - matplotlib.axes._subplots.Axes: The plot object displaying the outliers. Requirements: - pandas diff --git a/data/processed/f_758_wenhao_w_doc.py b/data/processed/f_758_wenhao_w_doc.py index f6e784b1..3e8edf00 100644 --- a/data/processed/f_758_wenhao_w_doc.py +++ b/data/processed/f_758_wenhao_w_doc.py @@ -21,7 +21,7 @@ def f_758(df: pd.DataFrame) -> tuple: with stock closing prices. Returns: - tuple: A tuple containing two matplotlib.axes._subplots.AxesSubplot objects: the first for the boxplot + tuple: A tuple containing two matplotlib.axes._subplots.Axes objects: the first for the boxplot and the second for the histogram. 
Example: diff --git a/data/processed/f_761_wenhao_w_doc.py b/data/processed/f_761_wenhao_w_doc.py index aa9714c7..1d88a870 100644 --- a/data/processed/f_761_wenhao_w_doc.py +++ b/data/processed/f_761_wenhao_w_doc.py @@ -16,7 +16,7 @@ def f_761(df, column): - column (str): The name of the column in the DataFrame that contains the categories. Output: - - matplotlib.axes._subplots.AxesSubplot: The Axes object for the generated plot. + - matplotlib.axes._subplots.Axes: The Axes object for the generated plot. Requirements: - pandas @@ -30,7 +30,6 @@ def f_761(df, column): >>> df = pd.DataFrame({'Type': ['A', 'A', 'C', 'E', 'D', 'E', 'D']}) >>> ax = f_761(df, 'Type') - # This generates and displays a bar chart showing the distribution of each category within the 'Type' column, including categories with zero occurrences. """ # Define the categories CATEGORIES = ['A', 'B', 'C', 'D', 'E'] diff --git a/data/processed/f_762_wenhao_w_doc.py b/data/processed/f_762_wenhao_w_doc.py index 53496317..cc16839d 100644 --- a/data/processed/f_762_wenhao_w_doc.py +++ b/data/processed/f_762_wenhao_w_doc.py @@ -11,7 +11,7 @@ def f_762(df): df (pandas.DataFrame): The DataFrame containing numerical columns to be used for correlation. Returns: - matplotlib.axes._subplots.AxesSubplot: The matplotlib Axes object representing the heatmap. + matplotlib.axes._subplots.Axes: The matplotlib Axes object representing the heatmap. 
Requirements: - pandas @@ -22,7 +22,7 @@ def f_762(df): >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) >>> ax = f_762(df) >>> type(ax) - + """ correlation_matrix = df.corr() diff --git a/data/processed/f_765_wenhao_w_doc.py b/data/processed/f_765_wenhao_w_doc.py index a868ee5c..3bbe05c7 100644 --- a/data/processed/f_765_wenhao_w_doc.py +++ b/data/processed/f_765_wenhao_w_doc.py @@ -24,10 +24,11 @@ def f_765(person_names, email_domains, num_records=5): - ValueError: If the number of names provided is less than the number of records requested or if no email domains are provided. Example: + >>> random.seed(0) # Initialize random seed >>> f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2) Name Email - 0 John Doe john[at]yahoo.com - 1 Jane Smith jane[at]gmail.com + 0 Jane Smith jane[at]gmail.com + 1 John Doe john[at]yahoo.com >>> f_765(['Alice'], ['outlook.com'], 1) Name Email 0 Alice alice[at]outlook.com diff --git a/data/processed/f_770_wenhao_w_doc.py b/data/processed/f_770_wenhao_w_doc.py index 7bb3364c..fef7a36d 100644 --- a/data/processed/f_770_wenhao_w_doc.py +++ b/data/processed/f_770_wenhao_w_doc.py @@ -26,8 +26,8 @@ def f_770(word: str) -> dict: - The function uses the `string` library to get a string of lowercase alphabets. Example: - >>> f_770('abcdef') - {'ab': 1, 'ac': 0, 'ad': 0, ..., 'yx': 0, 'yz': 0, 'za': 0, ..., 'zx': 0, 'zy': 0} + >>> list(f_770('abcdef').items())[:5] + [('ab', 1), ('ac', 0), ('ad', 0), ('ae', 0), ('af', 0)] """ ALPHABETS = string.ascii_lowercase # Generate all two-letter combinations of alphabets diff --git a/data/processed/f_778_wenhao_w_doc.py b/data/processed/f_778_wenhao_w_doc.py index 4598568c..2150e468 100644 --- a/data/processed/f_778_wenhao_w_doc.py +++ b/data/processed/f_778_wenhao_w_doc.py @@ -15,7 +15,7 @@ def f_778(word): Should contain only lowercase alphabetic characters. Returns: - AxesSubplot: A matplotlib.axes._subplots.AxesSubplot object representing the generated plot. 
+ Axes: A matplotlib.axes._subplots.Axes object representing the generated plot. Requirements: - numpy diff --git a/data/processed/f_798_wenhao_wo_doc.py b/data/processed/f_798_wenhao_wo_doc.py index 1b5953c1..428e7863 100644 --- a/data/processed/f_798_wenhao_wo_doc.py +++ b/data/processed/f_798_wenhao_wo_doc.py @@ -32,7 +32,7 @@ def f_798(mystrings, text): Examples: >>> ax = f_798(['Lorem ipsum', 'consectetur adipiscing'], 'Lorem ipsum dolor sit amet lorem Ipsum') >>> type(ax) - + """ if not text: diff --git a/data/processed/f_810_wenhao_w_doc.py b/data/processed/f_810_wenhao_w_doc.py index 37b9f6fd..6d73f9c4 100644 --- a/data/processed/f_810_wenhao_w_doc.py +++ b/data/processed/f_810_wenhao_w_doc.py @@ -27,7 +27,7 @@ def f_810(func, x_range=(-2, 2), num_points=1000): Example: >>> ax = f_810(np.sin) >>> type(ax) - + >>> ax.get_legend_handles_labels()[-1] ['sin(x)', 'Integral of sin(x)'] """ diff --git a/data/processed/f_811_wenhao_w_doc.py b/data/processed/f_811_wenhao_w_doc.py index a53abcb5..dbeba060 100644 --- a/data/processed/f_811_wenhao_w_doc.py +++ b/data/processed/f_811_wenhao_w_doc.py @@ -11,7 +11,7 @@ def f_811(df): - df (pandas.DataFrame): A DataFrame with numerical values. Returns: - - matplotlib.axes._subplots.AxesSubplot: The AxesSubplot object of the Seaborn heatmap. + - matplotlib.axes._subplots.Axes: The Axes object of the Seaborn heatmap. Raises: - ValueError: If the DataFrame is empty or if no numeric columns are present. 
@@ -55,7 +55,7 @@ def test_non_numeric_columns_ignored(self): df = pd.DataFrame({"A": [1, 2, 3], "B": ["one", "two", "three"]}) ax = f_811(df) self.assertIsInstance( - ax, plt.Axes, "The result should be a matplotlib AxesSubplot object" + ax, plt.Axes, "The result should be a matplotlib Axes object" ) self.assertEqual( len(ax.get_xticklabels()), 1, "Non-numeric columns should be ignored" @@ -64,25 +64,25 @@ def test_with_positive_numbers(self): df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) result = f_811(df) self.assertIsInstance( - result, plt.Axes, "The result should be a matplotlib AxesSubplot object" + result, plt.Axes, "The result should be a matplotlib Axes object" ) def test_with_negative_numbers(self): df = pd.DataFrame({"A": [-1, -2, -3], "B": [-4, -5, -6]}) result = f_811(df) self.assertIsInstance( - result, plt.Axes, "The result should be a matplotlib AxesSubplot object" + result, plt.Axes, "The result should be a matplotlib Axes object" ) def test_with_mixed_numbers(self): df = pd.DataFrame({"A": [1, -2, 3], "B": [-4, 5, -6]}) result = f_811(df) self.assertIsInstance( - result, plt.Axes, "The result should be a matplotlib AxesSubplot object" + result, plt.Axes, "The result should be a matplotlib Axes object" ) def test_with_zeroes(self): df = pd.DataFrame({"A": [0, 0, 0], "B": [0, 0, 0]}) result = f_811(df) self.assertIsInstance( - result, plt.Axes, "The result should be a matplotlib AxesSubplot object" + result, plt.Axes, "The result should be a matplotlib Axes object" ) def test_with_empty_dataframe(self): df = pd.DataFrame({"A": [], "B": []}) diff --git a/data/processed/f_820_wenhao_w_doc.py b/data/processed/f_820_wenhao_w_doc.py index bfb46d6d..77b1567d 100644 --- a/data/processed/f_820_wenhao_w_doc.py +++ b/data/processed/f_820_wenhao_w_doc.py @@ -36,7 +36,7 @@ def f_820(array, features=None, seed=None): >>> array = np.random.rand(2, 5) >>> ax = f_820(array, features=['A', 'B', 'C', 'D', 'E'], seed=1) >>> type(ax) - + >>> 
ax.collections[0].get_array().data.flatten() array([0.60276338, 0.71518937, 0.4236548 , 0.5488135 , 0.54488318, 0.891773 , 0.43758721, 0.38344152, 0.64589411, 0.96366276]) diff --git a/data/processed/f_827_wenhao_w_doc.py b/data/processed/f_827_wenhao_w_doc.py index 4ce90f18..af48ac77 100644 --- a/data/processed/f_827_wenhao_w_doc.py +++ b/data/processed/f_827_wenhao_w_doc.py @@ -13,7 +13,7 @@ def f_827(df, x_column, y_column): y_column (str): The column name for the y-axis. Data contained in column must be numeric. Returns: - matplotlib.axes._subplots.AxesSubplot: The Axes object containing the scatter plot and the linear regression line. + matplotlib.axes._subplots.Axes: The Axes object containing the scatter plot and the linear regression line. Requirements: - pandas @@ -28,7 +28,7 @@ def f_827(df, x_column, y_column): >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]}) >>> ax = f_827(df, 'A', 'B') >>> type(ax) - + """ X = df[x_column].values.reshape(-1, 1) Y = df[y_column].values diff --git a/data/processed/f_830_wenhao_w_doc.py b/data/processed/f_830_wenhao_w_doc.py index 232f5e0c..cacf6bef 100644 --- a/data/processed/f_830_wenhao_w_doc.py +++ b/data/processed/f_830_wenhao_w_doc.py @@ -38,7 +38,7 @@ def f_830(json_data: str, data_key: str): >>> json_str = '{"data": {"values": [5, 10, 15, 20, 25]}}' >>> original_data, normalized_data, ax = f_830(json_str, 'data.values') >>> type(original_data), type(normalized_data), type(ax) - (, , ) + (, , ) """ data = json.loads(json_data) try: diff --git a/data/processed/f_836_chien_w_doc.py b/data/processed/f_836_chien_w_doc.py index c10fb448..70b005d4 100644 --- a/data/processed/f_836_chien_w_doc.py +++ b/data/processed/f_836_chien_w_doc.py @@ -15,7 +15,7 @@ def f_836(text): of characters and punctuation. 
Returns: - matplotlib.axes._subplots.AxesSubplot: An Axes object showing the histogram and optionally the KDE + matplotlib.axes._subplots.Axes: An Axes object showing the histogram and optionally the KDE plot of word lengths. This visual representation helps in understanding the distribution of word lengths in the given text. diff --git a/data/processed/f_857_chien_wo_doc.py b/data/processed/f_857_chien_wo_doc.py index 0abd8c9d..fbcfc97f 100644 --- a/data/processed/f_857_chien_wo_doc.py +++ b/data/processed/f_857_chien_wo_doc.py @@ -15,7 +15,7 @@ def f_857(api_url): Returns: - DataFrame: A pandas DataFrame with the parsed data from the API. - - AxesSubplot or None: A matplotlib AxesSubplot object representing the plot of the data, or None if the data is empty. + - Axes or None: A matplotlib Axes object representing the plot of the data, or None if the data is empty. Raises: - HTTPError: If the API request fails due to issues like network problems, invalid response, etc. diff --git a/data/processed/f_867_chien_w_doc.py b/data/processed/f_867_chien_w_doc.py index c345f9da..57db117a 100644 --- a/data/processed/f_867_chien_w_doc.py +++ b/data/processed/f_867_chien_w_doc.py @@ -19,7 +19,7 @@ def f_867(data_dict): Returns: - DataFrame: A pandas DataFrame created from the input dictionary, excluding None values. - - AxesSubplot or None: A seaborn histogram plot object if the DataFrame contains variable data; + - Axes or None: A seaborn histogram plot object if the DataFrame contains variable data; None if the DataFrame is empty or if all values are identical. Requirements: diff --git a/data/processed/f_875_chien_w_doc.py b/data/processed/f_875_chien_w_doc.py index 895cfee9..25519c6e 100644 --- a/data/processed/f_875_chien_w_doc.py +++ b/data/processed/f_875_chien_w_doc.py @@ -30,7 +30,7 @@ def f_875(rows=1000, string_length=3): Default is 3. A value of 0 results in the generation of empty strings. 
Returns: - - matplotlib.axes._subplots.AxesSubplot or None: A seaborn heatmap plot object if + - matplotlib.axes._subplots.Axes or None: A seaborn heatmap plot object if data is generated; otherwise, None. Requirements: diff --git a/data/processed/f_887_chien_w_doc.py b/data/processed/f_887_chien_w_doc.py index ae0494bc..68e82d3f 100644 --- a/data/processed/f_887_chien_w_doc.py +++ b/data/processed/f_887_chien_w_doc.py @@ -15,7 +15,7 @@ def f_887(data_list): - data_list (list): A list containing category labels (strings). Returns: - - Axes object (matplotlib.axes._subplots.AxesSubplot): The histogram displaying the distribution of categories. + - Axes object (matplotlib.axes._subplots.Axes): The histogram displaying the distribution of categories. Requirements: - pandas diff --git a/data/processed/f_891_chien_wo_doc.py b/data/processed/f_891_chien_wo_doc.py index f30d53a8..0acb05e9 100644 --- a/data/processed/f_891_chien_wo_doc.py +++ b/data/processed/f_891_chien_wo_doc.py @@ -22,7 +22,7 @@ def f_891(date_str): Example: >>> ax = f_891('2023-06-15') >>> type(ax) - + """ date = datetime.strptime(date_str, "%Y-%m-%d") num_of_values = date.day diff --git a/data/processed/f_895_chien_w_doc.py b/data/processed/f_895_chien_w_doc.py index 5e30127a..4cba52c7 100644 --- a/data/processed/f_895_chien_w_doc.py +++ b/data/processed/f_895_chien_w_doc.py @@ -13,7 +13,7 @@ def f_895(data_dict): Returns: - tuple: A tuple containing: - - matplotlib.axes._subplots.AxesSubplot: The axes object of the histogram. + - matplotlib.axes._subplots.Axes: The axes object of the histogram. - str: A message indicating whether the distribution is uniform ("The distribution is uniform.") or not ("The distribution is not uniform."). 
diff --git a/data/processed/f_898_chien_wo_doc.py b/data/processed/f_898_chien_wo_doc.py index d0078dd9..fb91407c 100644 --- a/data/processed/f_898_chien_wo_doc.py +++ b/data/processed/f_898_chien_wo_doc.py @@ -16,12 +16,12 @@ def f_898(file_path): a single numeric value representing an individual in the population. Returns: - - Tuple (float, float, matplotlib.axes._subplots.AxesSubplot): The function returns a tuple containing + - Tuple (float, float, matplotlib.axes._subplots.Axes): The function returns a tuple containing three elements: - Sample mean (float): The mean of the sample. - Sample standard deviation (float): The standard deviation of the sample, calculated with a degrees of freedom (ddof) of 1. - - Matplotlib subplot (matplotlib.axes._subplots.AxesSubplot): An object representing the + - Matplotlib subplot (matplotlib.axes._subplots.Axes): An object representing the generated histogram plot with the normal distribution curve. Requirements: diff --git a/data/processed/f_902_chien_w_doc.py b/data/processed/f_902_chien_w_doc.py index f2116236..0fb30325 100644 --- a/data/processed/f_902_chien_w_doc.py +++ b/data/processed/f_902_chien_w_doc.py @@ -43,7 +43,7 @@ def f_902(num_pairs=10): possible unique combinations (100), it is adjusted to the valid range (1 to 100). Returns: - - ax (matplotlib.axes._subplots.AxesSubplot): The Axes object of the countplot, which can be used for + - ax (matplotlib.axes._subplots.Axes): The Axes object of the countplot, which can be used for further customizations or to retrieve information about the plot. Requirements: diff --git a/data/processed/f_906_chien_w_doc.py b/data/processed/f_906_chien_w_doc.py index 89d1da4f..04983e7c 100644 --- a/data/processed/f_906_chien_w_doc.py +++ b/data/processed/f_906_chien_w_doc.py @@ -14,7 +14,7 @@ def f_906(arr): arr (numpy.ndarray): A 2D numpy array. Returns: - matplotlib.axes._subplots.AxesSubplot: A plot representing the time series of row sums. 
+ matplotlib.axes._subplots.Axes: A plot representing the time series of row sums. Requirements: - pandas @@ -58,7 +58,7 @@ def test_basic_functionality(self): """Test the basic functionality of the function.""" arr = np.array([[i + j for i in range(3)] for j in range(5)]) ax = f_906(arr) - # Check if the function returns AxesSubplot object + # Check if the function returns Axes object self.assertIsInstance(ax, plt.Axes) # Check the title of the plot self.assertEqual(ax.get_title(), "Time Series of Row Sums") @@ -70,7 +70,7 @@ def test_empty_array(self): """Test the function with an empty array.""" arr = np.array([]) ax = f_906(arr) - # Check if the function returns AxesSubplot object + # Check if the function returns Axes object self.assertIsInstance(ax, plt.Axes) # Check the title of the plot self.assertEqual(ax.get_title(), "Time Series of Row Sums") @@ -81,7 +81,7 @@ def test_single_row_array(self): """Test the function with a single row array.""" arr = np.array([[1, 2, 3]]) ax = f_906(arr) - # Check if the function returns AxesSubplot object + # Check if the function returns Axes object self.assertIsInstance(ax, plt.Axes) # Check the title of the plot self.assertEqual(ax.get_title(), "Time Series of Row Sums") @@ -93,7 +93,7 @@ def test_negative_values(self): """Test the function with negative values.""" arr = np.array([[-1, -2, -3], [-4, -5, -6]]) ax = f_906(arr) - # Check if the function returns AxesSubplot object + # Check if the function returns Axes object self.assertIsInstance(ax, plt.Axes) # Check the title of the plot self.assertEqual(ax.get_title(), "Time Series of Row Sums") @@ -105,7 +105,7 @@ def test_zero_values(self): """Test the function with zero values.""" arr = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]]) ax = f_906(arr) - # Check if the function returns AxesSubplot object + # Check if the function returns Axes object self.assertIsInstance(ax, plt.Axes) # Check the title of the plot self.assertEqual(ax.get_title(), "Time Series of Row Sums") 
diff --git a/data/processed/f_910_chien_w_doc.py b/data/processed/f_910_chien_w_doc.py index cccac8d4..a9f8cc6e 100644 --- a/data/processed/f_910_chien_w_doc.py +++ b/data/processed/f_910_chien_w_doc.py @@ -28,7 +28,7 @@ def f_910(num_samples=NUM_SAMPLES, num_outliers=NUM_OUTLIERS): the artificially introduced outliers. - outliers_detected (numpy array): The outliers detected using the IQR method. This detection is based solely on the normally distributed portion of the data. - - ax (matplotlib.axes._subplots.AxesSubplot): The AxesSubplot object for the histogram + - ax (matplotlib.axes._subplots.Axes): The Axes object for the histogram plot of the combined dataset. Requirements: diff --git a/data/processed/f_915_chien_w_doc.py b/data/processed/f_915_chien_w_doc.py index 13184ed1..f4b2a3e5 100644 --- a/data/processed/f_915_chien_w_doc.py +++ b/data/processed/f_915_chien_w_doc.py @@ -70,7 +70,7 @@ def test_return_types(self): self.assertIsInstance( ax, Axes, - "The second return value should be an instance of matplotlib.axes._subplots.AxesSubplot.", + "The second return value should be an instance of matplotlib.axes._subplots.Axes.", ) def test_number_of_lines(self): """Check that the correct number of lines are plotted.""" diff --git a/data/processed/f_917_chien_wo_doc.py b/data/processed/f_917_chien_wo_doc.py index 26129458..803504c8 100644 --- a/data/processed/f_917_chien_wo_doc.py +++ b/data/processed/f_917_chien_wo_doc.py @@ -13,7 +13,7 @@ def f_917(time_strings, time_format="%d/%m/%Y %H:%M:%S.%f"): The default format is '%d/%m/%Y %H:%M:%S.%f', representing day/month/year hours:minutes:seconds.microseconds. Returns: - - ax (matplotlib.axes._subplots.AxesSubplot or None): An AxesSubplot object with the histogram plotted if + - ax (matplotlib.axes._subplots.Axes or None): An Axes object with the histogram plotted if parsing is successful. Returns None if a parsing error occurs. 
Requirements: diff --git a/data/processed/f_923_chien_w_doc.py b/data/processed/f_923_chien_w_doc.py index 6e1810ae..8aac3266 100644 --- a/data/processed/f_923_chien_w_doc.py +++ b/data/processed/f_923_chien_w_doc.py @@ -19,7 +19,7 @@ def f_923(data): Returns: - dict: Contains the calculated mean, median, and standard deviation (sample) of the prices. The keys are 'mean', 'median', and 'std_dev'. - - matplotlib.axes._subplots.AxesSubplot: A subplot object that represents the histogram plot of the product prices. + - matplotlib.axes._subplots.Axes: A subplot object that represents the histogram plot of the product prices. The histogram displays the frequency distribution of the prices. Note: diff --git a/data/processed/f_925_chien_w_doc.py b/data/processed/f_925_chien_w_doc.py index 6879f967..4a0dbd45 100644 --- a/data/processed/f_925_chien_w_doc.py +++ b/data/processed/f_925_chien_w_doc.py @@ -24,7 +24,7 @@ def f_925(data=None): } Returns: - - ax (matplotlib.axes._subplots.AxesSubplot): A scatter plot with weight on the x-axis and height on the y-axis, titled "Weight vs Height". + - ax (matplotlib.axes._subplots.Axes): A scatter plot with weight on the x-axis and height on the y-axis, titled "Weight vs Height". Raises: - ValueError: If any of the values in the 'Weight_String' key are not formatted as strings. 
This validation ensures diff --git a/data/raw/f_746_wenhao.py b/data/raw/f_746_wenhao.py index 6032121c..f4556362 100644 --- a/data/raw/f_746_wenhao.py +++ b/data/raw/f_746_wenhao.py @@ -20,11 +20,11 @@ def f_746(d, keys=['x', 'y', 'z']): >>> data = [{'x': 1, 'y': 10, 'z': 5}, {'x': 3, 'y': 15, 'z': 6}, {'x': 2, 'y': 1, 'z': 7}] >>> ax = f_746(data) >>> type(ax) - + >>> ax = f_746(data, keys=['x', 'y']) >>> type(ax) - + """ # Convert the list of dictionaries to a DataFrame df = pd.DataFrame(d) diff --git a/data/raw/f_752_wenhao.py b/data/raw/f_752_wenhao.py index bf5f2a4b..4fee97f4 100644 --- a/data/raw/f_752_wenhao.py +++ b/data/raw/f_752_wenhao.py @@ -31,7 +31,7 @@ def f_752(letters, repetitions, colors): Example: >>> ax = f_752(['A', 'B', 'C'], [3, 5, 2], ['red', 'green', 'blue']) >>> type(ax) - + """ if len(letters) != len(repetitions) or len(letters) != len(colors) or len(letters) == 0: raise ValueError("All lists must be the same length and non-empty.") diff --git a/data/raw/f_761_wenhao.py b/data/raw/f_761_wenhao.py index dfde8305..7cb4838a 100644 --- a/data/raw/f_761_wenhao.py +++ b/data/raw/f_761_wenhao.py @@ -16,7 +16,7 @@ def f_761(df, column): - column (str): The name of the column in the DataFrame that contains the categories. Output: - - matplotlib.axes._subplots.AxesSubplot: The Axes object for the generated plot. + - matplotlib.axes._subplots.Axes: The Axes object for the generated plot. Requirements: - pandas @@ -30,7 +30,6 @@ def f_761(df, column): >>> df = pd.DataFrame({'Type': ['A', 'A', 'C', 'E', 'D', 'E', 'D']}) >>> ax = f_761(df, 'Type') - # This generates and displays a bar chart showing the distribution of each category within the 'Type' column, including categories with zero occurrences. 
""" # Define the categories CATEGORIES = ['A', 'B', 'C', 'D', 'E'] diff --git a/data/raw/f_762_wenhao.py b/data/raw/f_762_wenhao.py index 16a1a8e8..61d6fd36 100644 --- a/data/raw/f_762_wenhao.py +++ b/data/raw/f_762_wenhao.py @@ -11,7 +11,7 @@ def f_762(df): df (pandas.DataFrame): The DataFrame containing numerical columns to be used for correlation. Returns: - matplotlib.axes._subplots.AxesSubplot: The matplotlib Axes object representing the heatmap. + matplotlib.axes._subplots.Axes: The matplotlib Axes object representing the heatmap. Requirements: - pandas @@ -22,7 +22,7 @@ def f_762(df): >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) >>> ax = f_762(df) >>> type(ax) - + """ correlation_matrix = df.corr() diff --git a/data/raw/f_765_wenhao.py b/data/raw/f_765_wenhao.py index 22757d9f..a0345ab3 100644 --- a/data/raw/f_765_wenhao.py +++ b/data/raw/f_765_wenhao.py @@ -24,10 +24,11 @@ def f_765(person_names, email_domains, num_records=5): - ValueError: If the number of names provided is less than the number of records requested or if no email domains are provided. Example: + >>> random.seed(0) # Initialize random seed >>> f_765(['John Doe', 'Jane Smith'], ['gmail.com', 'yahoo.com'], 2) Name Email - 0 John Doe john[at]yahoo.com - 1 Jane Smith jane[at]gmail.com + 0 Jane Smith jane[at]gmail.com + 1 John Doe john[at]yahoo.com >>> f_765(['Alice'], ['outlook.com'], 1) Name Email 0 Alice alice[at]outlook.com diff --git a/data/raw/f_770_wenhao.py b/data/raw/f_770_wenhao.py index 17a0b58b..79426deb 100644 --- a/data/raw/f_770_wenhao.py +++ b/data/raw/f_770_wenhao.py @@ -26,8 +26,8 @@ def f_770(word: str) -> dict: - The function uses the `string` library to get a string of lowercase alphabets. 
Example: - >>> f_770('abcdef') - {'ab': 1, 'ac': 0, 'ad': 0, ..., 'yx': 0, 'yz': 0, 'za': 0, ..., 'zx': 0, 'zy': 0} + >>> list(f_770('abcdef').items())[:5] + [('ab', 1), ('ac', 0), ('ad', 0), ('ae', 0), ('af', 0)] """ ALPHABETS = string.ascii_lowercase # Generate all two-letter combinations of alphabets diff --git a/script/parse.py b/script/parse.py index b49980ef..df740d76 100644 --- a/script/parse.py +++ b/script/parse.py @@ -168,7 +168,7 @@ def extract_content(file_path): data["task_id"] = function_name break with open(file_path, "r", encoding="utf-8") as f: - content = f.read().strip("\n").replace("", "") + content = f.read().strip("\n").replace("AxesSubplot", "Axes") # Extracting the docstring if present docstring_start = content.find('"""')