From 9866f54c319c1b90f2a1456cc8fb7c643847fc5d Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Fri, 21 Feb 2025 10:03:07 -0800 Subject: [PATCH] Update wiki dump date and few minor updates in single gpu tutorial Signed-off-by: Ayush Dattagupta --- .../single_node_tutorial/single_gpu_tutorial.ipynb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb index c7f40053..18861820 100644 --- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb +++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb @@ -306,7 +306,7 @@ "source": [ "res = download_wikipedia(download_output_directory,\n", " language=language, \n", - " url_limit=url_limit).df.compute()" + " url_limit=url_limit, dump_date=\"20250201\").df.compute()" ] }, { @@ -1094,7 +1094,8 @@ "\n", "!mkdir -p {fuzzy_dedup_base_output_path}\n", "!mkdir -p {fuzzy_dedup_log_dir}\n", - "!mkdir -p {fuzzy_dedup_cache_dir}\n", + "!mkdir -p {fuzzy_dedup_no_false_positive_cache_dir}\n", + "!mkdir -p {fuzzy_dedup_false_positive_cache_dir}\n", "!mkdir -p {fuzzy_dedup_output_dir}" ] }, @@ -1570,7 +1571,7 @@ "input_id_field = 'id'\n", "\n", "\n", - "!mkdir -p {edgelist_output_path}\n", + "!mkdir -p {edgelist_output_dir}\n", "!mkdir -p {buckets_to_edges_log_path}" ] }, @@ -1675,11 +1676,13 @@ "connected_component_base_output_path = os.path.join(data_dir,\"fuzzy/cc\")\n", "connected_component_output_path = os.path.join(connected_component_base_output_path, \"connected_components.parquet\")\n", "connected_component_cache_dir = os.path.join(connected_component_base_output_path, \"cache\")\n", + "connected_component_log_path = os.path.join(connected_component_base_output_path,\"log\")\n", "\n", "#Relevant parameters\n", "input_id_field = 'id'\n", "\n", - "!mkdir -p {connected_component_base_output_path}" + "!mkdir -p {connected_component_base_output_path}\n", + "!mkdir -p {connected_component_log_path}" ] }, { @@ -1705,6 +1708,7 @@ " cache_dir=connected_component_cache_dir,\n", " jaccard_pairs_path=jaccard_pairs_path,\n", " id_column=input_id_field,\n", + " logger=connected_component_log_path,\n", ")\n", "\n", "#Load and run connected component\n",