diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3574e8a7d..b7d87e6fc 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -31,6 +31,10 @@ v34.9.4 (unreleased)
   The labels are now always presented in alphabetical order for consistency.
   https://github.com/aboutcode-org/scancode.io/issues/1520
 
+- Add a ``batch-create`` management command that allows creating multiple projects
+  at once from a directory containing input files or from a CSV input list.
+  https://github.com/aboutcode-org/scancode.io/issues/1437
+
 v34.9.3 (2024-12-31)
 --------------------
 
diff --git a/docs/command-line-interface.rst b/docs/command-line-interface.rst
index dd2bb6756..ef16047a8 100644
--- a/docs/command-line-interface.rst
+++ b/docs/command-line-interface.rst
@@ -57,6 +57,7 @@ ScanPipe's own commands are listed under the ``[scanpipe]`` section::
     add-input
     add-pipeline
     archive-project
+    batch-create
     check-compliance
     create-project
     create-user
@@ -83,7 +84,8 @@ For example::
     $ scanpipe create-project --help
     usage: scanpipe create-project [--input-file INPUTS_FILES] [--input-url INPUT_URLS]
                                    [--copy-codebase SOURCE_DIRECTORY]
-                                   [--pipeline PIPELINES] [--execute] [--async]
+                                   [--pipeline PIPELINES] [--label LABELS] [--notes NOTES]
+                                   [--execute] [--async]
                                    name
 
     Create a ScanPipe project.
@@ -124,6 +126,10 @@ Optional arguments:
 - ``--copy-codebase SOURCE_DIRECTORY`` Copy the content of the provided source directory
   into the :guilabel:`codebase/` work directory.
 
+- ``--notes NOTES`` Optional notes about the project.
+
+- ``--label LABELS`` Optional labels for the project.
+
 - ``--execute`` Execute the pipelines right after project creation.
 
 - ``--async`` Add the pipeline run to the tasks queue for execution by a worker instead
   of running in the current thread.
@@ -133,6 +139,90 @@ Optional arguments:
 
 .. warning:: Pipelines are added and are executed in order.
 
+.. _cli_batch_create:
+
+`$ scanpipe batch-create [--input-directory INPUT_DIRECTORY] [--input-list FILENAME.csv]`
+-----------------------------------------------------------------------------------------
+
+Processes files from the specified ``INPUT_DIRECTORY`` or rows from ``FILENAME.csv``,
+creating a project for each file or row.
+
+- Use ``--input-directory`` to specify a local directory. Each file in the directory
+  will result in a project, uniquely named using the filename and a timestamp.
+
+- Use ``--input-list`` to specify a ``FILENAME.csv``. Each row in the CSV will be used
+  to create a project based on the data provided.
+
+Supports specifying pipelines and asynchronous execution.
+
+Required arguments (one of):
+
+- ``--input-directory INPUT_DIRECTORY`` The path to the directory containing the input
+  files to process. Ensure the directory exists and contains the files you want to use.
+
+- ``--input-list FILENAME.csv`` Path to a CSV file with project names and input URLs.
+  The first column must contain project names, and the second column should list
+  comma-separated input URLs (e.g., Download URL, PURL, or Docker reference).
+
+  **CSV content example**:
+
+  +----------------+---------------------------------+
+  | project_name   | input_urls                      |
+  +================+=================================+
+  | project-1      | https://url.com/file.ext        |
+  +----------------+---------------------------------+
+  | project-2      | pkg:deb/debian/curl@7.50.3      |
+  +----------------+---------------------------------+
+
+Optional arguments:
+
+- ``--project-name-suffix`` Optional custom suffix to append to project names.
+  If not provided, a timestamp (in the format [YYMMDD_HHMMSS]) will be used.
+
+- ``--pipeline PIPELINES`` Pipeline names to add to the project.
+
+- ``--notes NOTES`` Optional notes about the project.
+
+- ``--label LABELS`` Optional labels for the project.
+
+- ``--execute`` Execute the pipelines right after project creation.
+
+- ``--async`` Add the pipeline run to the tasks queue for execution by a worker instead
+  of running in the current thread.
+  Applies only when ``--execute`` is provided.
+
+Example: Processing Multiple Docker Images
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Assume multiple Docker images are available in a directory named ``local-data/`` on
+the host machine.
+To process these images with the ``analyze_docker_image`` pipeline using asynchronous
+execution::
+
+    $ docker compose run --rm \
+        --volume local-data/:/input-data:ro \
+        web scanpipe batch-create --input-directory /input-data/ \
+        --pipeline analyze_docker_image \
+        --label "Docker" \
+        --execute --async
+
+**Explanation**:
+
+- ``local-data/``: A directory on the host machine containing the Docker images to
+  process.
+- ``/input-data/``: The directory inside the container where ``local-data/`` is
+  mounted (read-only).
+- ``--pipeline analyze_docker_image``: Specifies the ``analyze_docker_image``
+  pipeline for processing each Docker image.
+- ``--label "Docker"``: Tags all the projects with the "Docker" label to enable
+  easy search and filtering.
+- ``--execute``: Runs the pipeline immediately after creating a project for each
+  image.
+- ``--async``: Adds the pipeline run to the worker queue for asynchronous execution.
+
+Each Docker image in the ``local-data/`` directory will result in the creation of a
+project with the specified pipeline (``analyze_docker_image``) executed by worker
+services.
 
 `$ scanpipe list-pipeline [--verbosity {0,1,2,3}]`
 --------------------------------------------------
diff --git a/docs/faq.rst b/docs/faq.rst
index aea2fc65d..a7093b316 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -108,6 +108,33 @@ It does not compute such summary.
 You can also have a look at the different steps for each pipeline from the
 :ref:`built_in_pipelines` documentation.
 
+How can I create multiple projects at once?
+-------------------------------------------
+
+You can use the :ref:`cli_batch_create` command to create multiple projects
+simultaneously.
+This command processes all files in a specified input directory, creating one project
+per file.
+Each project is uniquely named using the file name and a timestamp by default.
+
+For example, to create multiple projects from files in a directory named
+``local-data/``::
+
+    $ docker compose run --rm \
+        --volume local-data/:/input-data:ro \
+        web scanpipe batch-create --input-directory /input-data/
+
+**Options**:
+
+- **Custom Pipelines**: Use the ``--pipeline`` option to add specific pipelines to the
+  projects.
+- **Asynchronous Execution**: Add ``--execute`` and ``--async`` to queue pipeline
+  execution for worker processing.
+- **Project Notes and Labels**: Use ``--notes`` and ``--label`` to include metadata.
+
+Each file in the input directory will result in the creation of a corresponding project,
+ready for pipeline execution.
+
 Can I run multiple pipelines in parallel?
 -----------------------------------------
 
@@ -279,7 +306,7 @@ data older than 7 days::
 See :ref:`command_line_interface` chapter for more information about the scanpipe
 command.
 
-How can I provide my license policies ?
+How can I provide my license policies?
 ---------------------------------------
 
 For detailed information about the policies system, refer to :ref:`policies`.
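A minimal end-to-end sketch of the ``--input-list`` mode documented above (the CSV
file name, the mount point, and the pipeline choice are illustrative assumptions):
given a ``project_list.csv`` placed in ``local-data/`` on the host::

    project_name,input_urls
    project-1,https://url.com/file.ext
    project-2,pkg:deb/debian/curl@7.50.3

the projects can be created in a single call, one project per row::

    $ docker compose run --rm \
        --volume local-data/:/input-data:ro \
        web scanpipe batch-create --input-list /input-data/project_list.csv \
        --pipeline scan_single_package \
        --execute --async

Each project is named after the first column, with the timestamp (or the value of
``--project-name-suffix``) appended.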
diff --git a/scanpipe/management/commands/__init__.py b/scanpipe/management/commands/__init__.py
index 05dcbbb9a..470729ba7 100644
--- a/scanpipe/management/commands/__init__.py
+++ b/scanpipe/management/commands/__init__.py
@@ -150,6 +150,28 @@ def display_status(self, project, verbosity):
         self.stdout.write(line)
 
 
+class PipelineCommandMixin:
+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+        parser.add_argument(
+            "--pipeline",
+            action="append",
+            dest="pipelines",
+            default=list(),
+            help=(
+                "Pipelines names to add to the project. "
+                "The pipelines are added and executed based on their given order. "
+                'Groups can be provided using the "pipeline_name:option1,option2" '
+                "syntax."
+            ),
+        )
+        parser.add_argument(
+            "--execute",
+            action="store_true",
+            help="Execute the pipelines right after the project creation.",
+        )
+
+
 class AddInputCommandMixin:
     def add_arguments(self, parser):
         super().add_arguments(parser)
@@ -427,6 +449,7 @@ def create_project(
     input_urls=None,
     copy_from="",
     notes="",
+    labels=None,
     execute=False,
     run_async=False,
     command=None,
@@ -451,6 +474,10 @@ def create_project(
     )
 
     project.save()
+
+    if labels:
+        project.labels.add(*labels)
+
     if command:
         command.project = project
 
@@ -491,6 +518,20 @@ def execute_project(self, run_async=False):
 
 
 class CreateProjectCommandMixin(ExecuteProjectCommandMixin):
+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+        parser.add_argument(
+            "--notes",
+            help="Optional notes about the project.",
+        )
+        parser.add_argument(
+            "--label",
+            action="append",
+            dest="labels",
+            default=list(),
+            help="Optional labels for the project.",
+        )
+
     def create_project(
         self,
         name,
@@ -499,9 +540,13 @@ def create_project(
         input_urls=None,
         copy_from="",
         notes="",
+        labels=None,
         execute=False,
         run_async=False,
     ):
+        if execute and not pipelines:
+            raise CommandError("The --execute option requires one or more pipelines.")
+
         return create_project(
             name=name,
             pipelines=pipelines,
@@ -509,6 +554,7 @@ def create_project(
             input_urls=input_urls,
             copy_from=copy_from,
             notes=notes,
+            labels=labels,
             execute=execute,
             run_async=run_async,
             command=self,
diff --git a/scanpipe/management/commands/batch-create.py b/scanpipe/management/commands/batch-create.py
new file mode 100644
index 000000000..955e0bffc
--- /dev/null
+++ b/scanpipe/management/commands/batch-create.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import csv
+from datetime import datetime
+from pathlib import Path
+
+from django.core.management import CommandError
+from django.core.management.base import BaseCommand
+
+from scanpipe.management.commands import CreateProjectCommandMixin
+from scanpipe.management.commands import PipelineCommandMixin
+
+
+class Command(CreateProjectCommandMixin, PipelineCommandMixin, BaseCommand):
+    help = (
+        "Creates a project for each file in the input directory, or for each row "
+        "in the input list CSV. Each project is uniquely named using the file or "
+        "project name and a timestamp. Supports pipelines and asynchronous execution."
+    )
+
+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+        parser.add_argument(
+            "--input-directory",
+            help=(
+                "The path to the directory containing the input files to process. "
+                "Ensure the directory exists and contains the files you want to use."
+            ),
+        )
+        parser.add_argument(
+            "--input-list",
+            metavar="FILENAME.csv",
+            help=(
+                "Path to a CSV file with project names and input URLs. "
+                "The first column must contain project names, and the second column "
+                "should list comma-separated input URLs (e.g., Download URL, PURL, or "
+                "Docker reference)."
+            ),
+        )
+        parser.add_argument(
+            "--project-name-suffix",
+            help=(
+                "Optional custom suffix to append to project names. If not provided, "
+                "a timestamp (in the format [YYMMDD_HHMMSS]) will be used."
+            ),
+        )
+
+    def handle(self, *args, **options):
+        self.verbosity = options["verbosity"]
+
+        input_directory = options["input_directory"]
+        input_list = options["input_list"]
+
+        if not (input_directory or input_list):
+            raise CommandError(
+                "You must provide either --input-directory or --input-list as input."
+ ) + + if input_directory: + self.handle_input_directory(**options) + + if input_list: + self.handle_input_list(**options) + + def handle_input_directory(self, **options): + timestamp = datetime.now().strftime("%y%m%d_%H%M%S") + project_name_suffix = options.get("project_name_suffix") or timestamp + + directory = Path(options["input_directory"]) + if not directory.exists(): + raise CommandError("The directory does not exist.") + + for file_path in directory.rglob("*"): + if file_path.is_file(): + project_name = f"{file_path.name} {project_name_suffix}" + self.create_project( + name=project_name, + pipelines=options["pipelines"], + input_files=[str(file_path)], + notes=options["notes"], + labels=options["labels"], + execute=options["execute"], + run_async=options["async"], + ) + + def handle_input_list(self, **options): + input_file = Path(options["input_list"]) + if not input_file.exists(): + raise CommandError(f"The {input_file} file does not exist.") + + timestamp = datetime.now().strftime("%y%m%d_%H%M%S") + project_name_suffix = options.get("project_name_suffix") or timestamp + + project_list = process_csv(input_file) + for project_data in project_list: + project_name = project_data["project_name"] + project_name = f"{project_name} {project_name_suffix}" + input_urls = project_data["input_urls"].split(",") + self.create_project( + name=project_name, + pipelines=options["pipelines"], + input_urls=input_urls, + notes=options["notes"], + labels=options["labels"], + execute=options["execute"], + run_async=options["async"], + ) + + +def process_csv(file_path): + required_headers = {"project_name", "input_urls"} + + with open(file_path, newline="", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + + # Validate headers + if not required_headers.issubset(reader.fieldnames): + raise ValueError( + f"The CSV file must contain the headers: {', '.join(required_headers)}" + ) + + project_list = [ + {"project_name": row["project_name"], "input_urls": row["input_urls"]} + for row in reader + ] + return project_list diff --git a/scanpipe/management/commands/create-project.py b/scanpipe/management/commands/create-project.py index 2a1e333e6..4b79ddc2e 100644 --- a/scanpipe/management/commands/create-project.py +++ b/scanpipe/management/commands/create-project.py @@ -20,41 +20,22 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. -from django.core.management import CommandError from django.core.management.base import BaseCommand from scanpipe.management.commands import AddInputCommandMixin from scanpipe.management.commands import CreateProjectCommandMixin +from scanpipe.management.commands import PipelineCommandMixin -class Command(CreateProjectCommandMixin, AddInputCommandMixin, BaseCommand): +class Command( + CreateProjectCommandMixin, AddInputCommandMixin, PipelineCommandMixin, BaseCommand +): help = "Create a ScanPipe project." verbosity = 1 def add_arguments(self, parser): super().add_arguments(parser) parser.add_argument("name", help="Project name.") - parser.add_argument( - "--pipeline", - action="append", - dest="pipelines", - default=list(), - help=( - "Pipelines names to add to the project. " - "The pipelines are added and executed based on their given order. " - 'Groups can be provided using the "pipeline_name:option1,option2" ' - "syntax." 
-            ),
-        )
-        parser.add_argument(
-            "--execute",
-            action="store_true",
-            help="Execute the pipelines right after the project creation.",
-        )
-        parser.add_argument(
-            "--notes",
-            help="Optional notes about the project.",
-        )
 
     def handle(self, *args, **options):
         self.verbosity = options["verbosity"]
@@ -64,12 +45,10 @@ def handle(self, *args, **options):
         input_urls = options["input_urls"]
         copy_from = options["copy_codebase"]
         notes = options["notes"]
+        labels = options["labels"]
         execute = options["execute"]
         run_async = options["async"]
 
-        if execute and not pipelines:
-            raise CommandError("The --execute option requires one or more pipelines.")
-
         self.create_project(
             name=name,
             pipelines=pipelines,
@@ -77,6 +56,7 @@ def handle(self, *args, **options):
             input_urls=input_urls,
             copy_from=copy_from,
             notes=notes,
+            labels=labels,
             execute=execute,
             run_async=run_async,
         )
diff --git a/scanpipe/tests/data/commands/batch-create-directory/a.txt b/scanpipe/tests/data/commands/batch-create-directory/a.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/commands/batch-create-directory/b.txt b/scanpipe/tests/data/commands/batch-create-directory/b.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/commands/batch-create-list/project_list.csv b/scanpipe/tests/data/commands/batch-create-list/project_list.csv
new file mode 100644
index 000000000..7b4bb11ee
--- /dev/null
+++ b/scanpipe/tests/data/commands/batch-create-list/project_list.csv
@@ -0,0 +1,3 @@
+project_name,input_urls,
+project-v1,"https://example.com/source.zip#from,https://example.com/binary.bin#to",
+project-v2,https://example.com/filename.zip,
\ No newline at end of file
diff --git a/scanpipe/tests/test_commands.py b/scanpipe/tests/test_commands.py
index 65689ddb7..f57303d91 100644
--- a/scanpipe/tests/test_commands.py
+++ b/scanpipe/tests/test_commands.py
@@ -93,6 +93,15 @@ def test_scanpipe_management_command_create_project_verbosity(self):
         self.assertEqual("", out.getvalue())
         self.assertTrue(Project.objects.get(name="my_project"))
 
+    def test_scanpipe_management_command_create_project_labels(self):
+        out = StringIO()
+        options = ["--label", "label1", "--label", "label2"]
+
+        call_command("create-project", "my_project", *options, stdout=out)
+        self.assertIn("Project my_project created", out.getvalue())
+        project = Project.objects.get(name="my_project")
+        self.assertEqual(["label1", "label2"], list(project.labels.names()))
+
     def test_scanpipe_management_command_create_project_notes(self):
         out = StringIO()
         notes = "Some notes about my project"
@@ -185,6 +194,89 @@ def test_scanpipe_management_command_create_project_execute(self):
             "Project other_project created with work directory", out.getvalue()
         )
 
+    def test_scanpipe_management_command_batch_create(self):
+        expected = "You must provide either --input-directory or --input-list as input."
+        with self.assertRaisesMessage(CommandError, expected):
+            call_command("batch-create")
+
+        input_directory = self.data / "commands" / "batch-create-directory"
+        options = [
+            "--input-directory",
+            str(input_directory),
+            "--pipeline",
+            "scan_package",
+            "--notes",
+            "Some notes",
+            "--label",
+            "label1",
+            "--label",
+            "label2",
+            "--project-name-suffix",
+            "suffix",
+        ]
+
+        out = StringIO()
+        call_command("batch-create", *options, stdout=out)
+        self.assertIn("Project a.txt suffix created", out.getvalue())
+        self.assertIn("Project b.txt suffix created", out.getvalue())
+
+        self.assertEqual(2, Project.objects.count())
+        project = Project.objects.get(name="a.txt suffix")
+        self.assertEqual("Some notes", project.notes)
+        self.assertEqual(["label1", "label2"], list(project.labels.names()))
+        self.assertEqual("scan_single_package", project.runs.get().pipeline_name)
+        self.assertEqual(["a.txt"], project.input_files)
+
+    @mock.patch("requests.sessions.Session.get")
+    def test_scanpipe_management_command_batch_create_input_list_csv(self, mock_get):
+        mock_responses = [
+            mock.Mock(
+                content=b"\x00",
+                headers={},
+                status_code=200,
+                url="https://example.com/source.zip",
+            ),
+            mock.Mock(
+                content=b"\x00",
+                headers={},
+                status_code=200,
+                url="https://example.com/binary.bin",
+            ),
+            mock.Mock(
+                content=b"\x00",
+                headers={},
+                status_code=200,
+                url="https://example.com/filename.zip",
+            ),
+        ]
+        mock_get.side_effect = mock_responses
+
+        input_list = self.data / "commands" / "batch-create-list" / "project_list.csv"
+        options = [
+            "--input-list",
+            str(input_list),
+            "--pipeline",
+            "map_deploy_to_develop",
+        ]
+
+        out = StringIO()
+        call_command("batch-create", *options, stdout=out)
+        self.assertIn("Project project-v1", out.getvalue())
+        self.assertIn("Project project-v2", out.getvalue())
+
+        self.assertEqual(2, Project.objects.count())
+        project1 = Project.objects.filter(name__contains="project-v1")[0]
+        self.assertEqual("map_deploy_to_develop", project1.runs.get().pipeline_name)
+        self.assertEqual(["binary.bin", "source.zip"], sorted(project1.input_files))
+        input_source = project1.inputsources.get(filename="source.zip")
+        self.assertEqual("from", input_source.tag)
+        input_source = project1.inputsources.get(filename="binary.bin")
+        self.assertEqual("to", input_source.tag)
+
+        project2 = Project.objects.filter(name__contains="project-v2")[0]
+        self.assertEqual("map_deploy_to_develop", project2.runs.get().pipeline_name)
+        self.assertEqual(["filename.zip"], sorted(project2.input_files))
+
     def test_scanpipe_management_command_add_input_file(self):
         out = StringIO()
 
@@ -1088,7 +1180,7 @@ def test_scanpipe_management_command_mixin_create_project_inputs(self):
         self.assertEqual("tag", tagged_source.tag)
 
     def test_scanpipe_management_command_mixin_create_project_execute(self):
-        expected = "The execute argument requires one or more pipelines."
+        expected = "The --execute option requires one or more pipelines."
         with self.assertRaisesMessage(CommandError, expected):
             self.create_project_command.create_project(name="my_project", execute=True)
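The new command can also be driven from Python, in the same way the tests above do.
The following is a minimal sketch, assuming a configured ScanCode.io/Django
environment and reachable input URLs; the project name, URLs, temporary path, and
pipeline choice are all illustrative::

    from pathlib import Path
    from tempfile import mkdtemp

    from django.core.management import call_command

    from scanpipe.models import Project

    # Build a small input list following the documented CSV layout: a
    # "project_name" column and an "input_urls" column holding comma-separated
    # URLs, optionally tagged with "#from" / "#to" fragments (illustrative URLs).
    csv_path = Path(mkdtemp()) / "project_list.csv"
    csv_path.write_text(
        "project_name,input_urls\n"
        'demo,"https://example.com/src.zip#from,https://example.com/bin.zip#to"\n'
    )

    # One project is created per CSV row; the pipeline is added to the project
    # but only runs when --execute (and optionally --async) is passed.
    call_command(
        "batch-create",
        "--input-list",
        str(csv_path),
        "--pipeline",
        "map_deploy_to_develop",
    )

    # The created project carries the queued pipeline and the tagged inputs.
    # The project name is the CSV "demo" value plus the timestamp suffix.
    project = Project.objects.get(name__startswith="demo")
    print(project.runs.get().pipeline_name)
    for input_source in project.inputsources.all():
        print(input_source.filename, input_source.tag)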