diff --git a/README.md b/README.md index 0e68a96..970566a 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ prov { } ``` -Finally, run your Nextflow pipeline. You do not need to modify your pipeline script in order to use the `nf-prov` plugin. The plugin will automatically generate a JSON file with provenance information. +Finally, run your Nextflow pipeline. You do not need to modify your pipeline script in order to use the `nf-prov` plugin. The plugin will automatically produce the specified provenance reports at the end of the workflow run. ## Configuration @@ -44,14 +44,16 @@ Create the provenance report (default: `true` if plugin is loaded). Configuration scope for the desired output formats. The following formats are available: -- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. - - *New in version 1.3.0*: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. +- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. See [BCO.md](./BCO.md) for more information about the additional config options for BCO. - `dag`: Render the task graph as a Mermaid diagram embedded in an HTML document. Supports the `file` and `overwrite` options. - `legacy`: Render the legacy format originally defined in this plugin (default). Supports the `file` and `overwrite` options. +*New in version 1.4.0* + +- `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). Includes all three profiles (Process, Workflow, and Provenance). See [WRROC.md](./WRROC.md) for more information about the additional config options for WRROC. + Any number of formats can be specified, for example: ```groovy @@ -69,6 +71,8 @@ prov { } ``` +See [nextflow.config](./nextflow.config) for a full example of each provenance format. + `prov.patterns` List of file patterns to include in the provenance report, from the set of published files. By default, all published files are included. @@ -114,16 +118,16 @@ Following these step to package, upload and publish the plugin: 2. Update the `Plugin-Version` field in the following file with the release version: - ```bash - plugins/nf-prov/src/resources/META-INF/MANIFEST.MF - ``` + ```bash + plugins/nf-prov/src/resources/META-INF/MANIFEST.MF + ``` 3. Run the following command to package and upload the plugin in the GitHub project releases page: - ```bash - ./gradlew :plugins:nf-prov:upload - ``` + ```bash + ./gradlew :plugins:nf-prov:upload + ``` -4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) - project to make the plugin public accessible to Nextflow app. +4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) + project to make the plugin public accessible to Nextflow app. diff --git a/WRROC.md b/WRROC.md new file mode 100644 index 0000000..cc38963 --- /dev/null +++ b/WRROC.md @@ -0,0 +1,45 @@ +# Additional WRROC configuration + +*New in version 1.4.0* + +The `wrroc` format supports additional options to configure certain aspects of the Workflow Run RO-Crate. These fields cannot be inferred automatically from the pipeline or the run, and so must be entered through the config. + +The following config options are supported: + +- `prov.formats.wrroc.agent.contactType` +- `prov.formats.wrroc.agent.email` +- `prov.formats.wrroc.agent.name` +- `prov.formats.wrroc.agent.orcid` +- `prov.formats.wrroc.agent.phone` +- `prov.formats.wrroc.agent.ror` +- `prov.formats.wrroc.organization.contactType` +- `prov.formats.wrroc.organization.email` +- `prov.formats.wrroc.organization.name` +- `prov.formats.wrroc.organization.phone` +- `prov.formats.wrroc.organization.ror` +- `prov.formats.wrroc.publisher` + +Refer to the [WRROC User Guide](https://www.researchobject.org/workflow-run-crate/) for more information about the associated RO-Crate entities. + +Here is an example config: + +```groovy +prov { + formats { + wrroc { + agent { + name = "John Doe" + orcid = "https://orcid.org/0000-0000-0000-0000" + email = "john.doe@example.org" + phone = "(0)89-99998 000" + contactType = "Researcher" + } + organization { + name = "University of XYZ" + ror = "https://ror.org/000000000" + } + publisher = "https://ror.org/000000000" + } + } +} +``` diff --git a/nextflow.config b/nextflow.config index 6219b1b..82b9e4d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,5 +20,13 @@ prov { file = "${params.outdir}/manifest.json" overwrite = true } + wrroc { + file = "${params.outdir}/ro-crate-metadata.json" + overwrite = true + } } } + +manifest { + license = "https://spdx.org/licenses/Apache-2.0" +} diff --git a/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy b/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy index f0dc26f..5d57dac 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy @@ -32,8 +32,6 @@ class PathNormalizer { private String commitId - private String launchDir - private String projectDir private String workDir @@ -42,14 +40,12 @@ class PathNormalizer { repository = metadata.repository ? new URL(metadata.repository) : null commitId = metadata.commitId projectDir = metadata.projectDir.toUriString() - launchDir = metadata.launchDir.toUriString() workDir = metadata.workDir.toUriString() } /** - * Normalize paths so that local absolute paths become - * relative paths, and local paths derived from remote URLs - * become the URLs. + * Normalize paths against the original remote URL, or + * work directory, where appropriate. * * @param path */ @@ -66,9 +62,9 @@ class PathNormalizer { if( repository && path.startsWith(projectDir) ) return getProjectSourceUrl(path) - // replace launch directory with relative path - if( path.startsWith(launchDir) ) - return path.replace(launchDir + '/', '') + // encode local absolute paths as file URLs + if( path.startsWith('/') ) + return 'file://' + path return path } diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy index 0a3ceba..cbd4fde 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy @@ -19,6 +19,7 @@ package nextflow.prov import java.nio.file.Path import groovy.transform.CompileStatic +import nextflow.Session import nextflow.exception.AbortOperationException import nextflow.file.FileHelper import nextflow.processor.TaskRun @@ -49,6 +50,25 @@ class ProvHelper { } } + /** + * Get the remote file staging directory for a workflow run. + * + * @param session + */ + static Path getStageDir(Session session) { + return session.workDir.resolve("stage-${session.uniqueId}") + } + + /** + * Determine whether a task input file was staged into the work directory. + * + * @param source + * @param session + */ + static boolean isStagedInput(Path source, Session session) { + return source.startsWith(getStageDir(session)) + } + /** * Get the list of output files for a task. * diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy index f508959..7dca105 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy @@ -40,7 +40,7 @@ import nextflow.trace.TraceRecord @CompileStatic class ProvObserver implements TraceObserver { - public static final List VALID_FORMATS = ['bco', 'dag', 'legacy'] + public static final List VALID_FORMATS = ['bco', 'dag', 'legacy', 'wrroc'] private Session session @@ -71,6 +71,9 @@ class ProvObserver implements TraceObserver { if( name == 'legacy' ) return new LegacyRenderer(opts) + if( name == 'wrroc' ) + return new WrrocRenderer(opts) + throw new IllegalArgumentException("Invalid provenance format -- valid formats are ${VALID_FORMATS.join(', ')}") } diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy new file mode 100644 index 0000000..46f1f65 --- /dev/null +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -0,0 +1,947 @@ +/* + * Copyright 2023, Seqera Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package nextflow.prov + +import java.nio.file.FileSystems +import java.nio.file.Files +import java.nio.file.Path +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter + +import groovy.json.JsonOutput +import groovy.json.JsonSlurper +import groovy.transform.CompileStatic +import groovy.util.logging.Slf4j +import nextflow.Session +import nextflow.processor.TaskProcessor +import nextflow.processor.TaskRun +import nextflow.script.ProcessDef +import nextflow.script.ScriptMeta +import nextflow.util.ConfigHelper +import org.yaml.snakeyaml.Yaml + +/** + * Renderer for the Provenance Run RO Crate format. + * + * @author Ben Sherman + * @author Felix Bartusch + * @author Famke Bäuerle + */ +@Slf4j +@CompileStatic +class WrrocRenderer implements Renderer { + + private static final List README_FILENAMES = List.of("README.md", "README.txt", "readme.md", "readme.txt", "Readme.md", "Readme.txt", "README") + + private Path path + + private boolean overwrite + + @Delegate + private PathNormalizer normalizer + + // List of contact points (people, organizations) to be added + private List contactPoints = [] + + WrrocRenderer(Map opts) { + path = (opts.file as Path).complete() + overwrite = opts.overwrite as Boolean + + ProvHelper.checkFileOverwrite(path, overwrite) + } + + @Override + void render(Session session, Set tasks, Map workflowOutputs) { + // get workflow inputs + final taskLookup = ProvHelper.getTaskLookup(tasks) + final workflowInputs = ProvHelper.getWorkflowInputs(tasks, taskLookup) + + // get workflow metadata + final metadata = session.workflowMetadata + final crateDir = path.getParent() + final projectDir = metadata.projectDir + this.normalizer = new PathNormalizer(metadata) + + final manifest = metadata.manifest + final scriptFile = metadata.getScriptFile() + + final formatter = DateTimeFormatter.ISO_OFFSET_DATE_TIME + final dateStarted = formatter.format(metadata.start) + final dateCompleted = formatter.format(metadata.complete) + final nextflowVersion = metadata.nextflow.version.toString() + final params = session.params + + // parse wrroc configuration + final wrrocOpts = session.config.navigate('prov.formats.wrroc', [:]) as Map + final agent = getAgentInfo(wrrocOpts) + final organization = getOrganizationInfo(wrrocOpts) + final publisherId = getPublisherId(wrrocOpts, agent, organization) + if( organization ) + agent["affiliation"] = ["@id": organization["@id"]] + + // create manifest + final datasetParts = [] + + // -- license + if( manifest.license ) { + datasetParts.add([ + "@id" : manifest.license, + "@type": "CreativeWork" + ]) + } + + // -- readme file + for( final fileName : README_FILENAMES ) { + final readmePath = projectDir.resolve(fileName) + if( !Files.exists(readmePath) ) + continue + + readmePath.copyTo(crateDir) + datasetParts.add([ + "@id" : fileName, + "@type" : "File", + "name" : fileName, + "description" : "The README file of the workflow.", + "encodingFormat": getEncodingFormat(readmePath) ?: "text/plain" + ]) + break + } + + // -- main script + final mainScriptId = metadata.scriptFile.name + final softwareApplicationId = "${mainScriptId}#software-application" + final organizeActionId = "${mainScriptId}#organize" + metadata.scriptFile.copyTo(crateDir) + + if( !metadata.repository ) + log.warn "Could not determine pipeline repository URL for Workflow Run RO-Crate -- launch the pipeline with canonical URL (e.g. `nextflow run nextflow-io/hello`) to ensure that the pipeline repository URL is recorded in the crate" + + // -- parameter schema + final schemaPath = scriptFile.getParent().resolve("nextflow_schema.json") + Map paramSchema = [:] + if( Files.exists(schemaPath) ) { + final fileName = schemaPath.name + + schemaPath.copyTo(crateDir) + datasetParts.add([ + "@id" : fileName, + "@type" : "File", + "name" : fileName, + "description" : "The parameter schema of the workflow.", + "encodingFormat": "application/json" + ]) + paramSchema = getParameterSchema(schemaPath) + } + + // -- resolved config + final configPath = crateDir.resolve("nextflow.config") + configPath.text = ConfigHelper.toCanonicalString(session.config, true) + + datasetParts.add([ + "@id" : "nextflow.config", + "@type" : "File", + "name" : "Resolved Nextflow configuration", + "description" : "The resolved Nextflow configuration for the workflow run.", + "encodingFormat": "text/plain" + ]) + + // -- pipeline parameters + // TODO: formal parameters for workflow output targets + final formalParameters = params + .collect { name, value -> + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + final encoding = type == "File" + ? getEncodingFormat(value as Path) + : null + + if( !type ) + log.warn "Could not determine type of parameter `${name}` for Workflow Run RO-Crate" + + return withoutNulls([ + "@id" : getFormalParameterId(name), + "@type" : "FormalParameter", + "additionalType": type, + "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], + "encodingFormat": encoding, + "name" : name, + "defaultValue" : schema.default, + "description" : schema.description, + ]) + } + + final propertyValues = params + .findAll { name, value -> value != null } + .collect { name, value -> + final paramId = getFormalParameterId(name) + final normalized = + (value instanceof List || value instanceof Map) ? JsonOutput.toJson(value) + : value instanceof CharSequence ? normalizePath(value.toString()) + : value + + return [ + "@id" : "${paramId}/value", + "@type" : "PropertyValue", + "exampleOfWork": ["@id": paramId], + "name" : name, + "value" : normalized + ] + } + + // -- input files + Map paramInputFiles = [:] + + params.each { name, value -> + if( !value ) + return + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + if( type != "File" ) + return + final source = (value as Path).complete() + // don't try to download remote files + if( source.fileSystem != FileSystems.default ) + return + // don't try to copy local directories + if( !source.isFile() ) + return + paramInputFiles.put(source, name) + } + + final inputFiles = workflowInputs + .findAll { source -> !ProvHelper.isStagedInput(source, session) } + .collect { source -> + final paramName = paramInputFiles[source] + if( paramName ) { + log.debug "Copying input file specified by `params.${paramName}` into RO-Crate: ${source.toUriString()}" + source.copyTo(crateDir) + } + + withoutNulls([ + "@id" : paramName ? source.name : normalizePath(source), + "@type" : getType(source), + "name" : source.name, + "encodingFormat": getEncodingFormat(source), + ]) + } + + // -- copy local input files specified by params to crate + params.each { name, value -> + if( !value ) + return + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + if( type == "File" ) { + final source = (value as Path).complete() + // don't try to download remote files + if( source.fileSystem != FileSystems.default ) + return + // don't try to copy local directories + if( !source.isFile() ) + return + inputFiles.add(withoutNulls([ + "@id" : source.name, + "@type" : type, + "description" : "Input file specified by params.${name}", + "encodingFormat": getEncodingFormat(source) + ])) + log.debug "Copying input file specified by params.${name} into RO-Crate: ${source.toUriString()}" + source.copyTo(crateDir) + } + } + + // -- output files + final outputFiles = workflowOutputs + .findAll { source, target -> + // warn about any output files outside of the crate directory + final result = target.startsWith(crateDir) + if( !result ) + log.warn "Excluding workflow output ${target} because it is outside of the RO-Crate directory -- make sure that the workflow output directory and RO-Crate directory are the same" + return result + } + .collect { source, target -> + withoutNulls([ + "@id" : crateDir.relativize(target).toString(), + "@type" : getType(target), + "name" : target.name, + "encodingFormat": getEncodingFormat(target), + ]) + } + + // -- workflow definition + final taskProcessors = tasks + .collect { task -> task.processor } + .unique() + + final processDefs = taskProcessors + .collect { process -> ScriptMeta.get(process.getOwnerScript()) } + .unique() + .collectMany { meta -> + meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } as List + } + + final processLookup = taskProcessors + .inject([:] as Map) { acc, processor -> + // HACK: when the owner script of a processor defines only one process, that must be the definition + final meta = ScriptMeta.get(processor.getOwnerScript()) + final defs = meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } as List + final processDef = defs.size() == 1 ? defs.first() : null + if( !processDef ) + log.warn "Could not identify process definition for `${processor.name}` -- resulting RO-Crate may be invalid (hint: define each process in a separate module script to fix this issue)" + acc[processor] = processDef + acc + } + + final moduleSoftwareApplications = processDefs + .collect() { process -> + final result = [ + "@id" : getModuleId(process), + "@type" : "SoftwareApplication", + "name" : process.baseName, + "url" : getModuleUrl(process), + ] + + final metaYaml = getModuleSchema(process) + if( metaYaml ) { + final name = metaYaml.name as String + final tools = metaYaml.getOrDefault('tools', []) as List + final parts = tools.collect { tool -> + final entry = (tool as Map).entrySet().first() + final toolName = entry.key as String + ["@id": getToolId(process.baseName, toolName)] + } + + if( name ) + result.name = name + if( parts ) + result.hasPart = parts + } + + return result + } + + final toolSoftwareApplications = processDefs + .collectMany { process -> + final metaYaml = getModuleSchema(process) + if( !metaYaml ) + return [] + + final tools = metaYaml.getOrDefault('tools', []) as List + return tools + .collect { tool -> + final entry = (tool as Map).entrySet().first() + final toolName = entry.key as String + final toolDescription = (entry.value as Map)?.get('description') as String + return [ + "@id" : getToolId(process.baseName, toolName), + "@type" : "SoftwareApplication", + "name" : toolName, + "description" : toolDescription + ] + } + } + + final howToSteps = taskProcessors + .collect() { process -> + final processDef = processLookup[process] + [ + "@id" : getProcessStepId(process), + "@type" : "HowToStep", + "workExample": processDef ? ["@id": getModuleId(processDef)] : null, + "position" : process.getId() + ] + } + + final controlActions = taskProcessors + .collect() { process -> + final taskIds = tasks + .findAll { task -> task.processor == process } + .collect { task -> ["@id": getTaskId(task)] } + + return [ + "@id" : getProcessControlId(process), + "@type" : "ControlAction", + "instrument": ["@id": getProcessStepId(process)], + "name" : "Orchestrate process ${process.name}", + "object" : taskIds + ] + } + + // -- workflow execution + final stagedInputs = workflowInputs + .findAll { source -> ProvHelper.isStagedInput(source, session) } + .collect { source -> + final name = getStagedInputName(source, session) + + withoutNulls([ + "@id" : "#stage/${name}", + "@type" : "CreativeWork", + "name" : name, + "encodingFormat": getEncodingFormat(source), + ]) + } + + final taskCreateActions = tasks + .collect { task -> + final processDef = processLookup[task.processor] + final inputs = task.getInputFilesMap().collect { name, source -> + final id = + source in taskLookup ? getTaskOutputId(taskLookup[source], source) + : ProvHelper.isStagedInput(source, session) ? "#stage/${getStagedInputName(source, session)}" + : normalizePath(source) + ["@id": id] + } + final outputs = ProvHelper.getTaskOutputs(task).collect { target -> + ["@id": getTaskOutputId(task, target)] + } + final result = [ + "@id" : getTaskId(task), + "@type" : "CreateAction", + "name" : task.name, + "instrument" : processDef ? ["@id": getModuleId(processDef)] : null, + "agent" : agent ? ["@id": agent["@id"]] : null, + "object" : inputs, + "result" : outputs, + "actionStatus": task.exitStatus == 0 ? "http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" + ] + if( task.exitStatus != 0 ) + result["error"] = task.stderr + return result + } + + final taskOutputs = tasks.collectMany { task -> + ProvHelper.getTaskOutputs(task).collect { target -> + final name = getTaskOutputName(task, target) + + return withoutNulls([ + "@id" : getTaskOutputId(task, name), + "@type" : "CreativeWork", + "name" : name, + "encodingFormat": getEncodingFormat(target), + ]) + } + } + + final publishCreateActions = workflowOutputs + .collect { source, target -> + final task = taskLookup[source] + final sourceName = getTaskOutputName(task, source) + + return [ + "@id" : "#publish/${task.hash}/${sourceName}", + "@type" : "CreateAction", + "name" : "publish", + "instrument" : ["@id": softwareApplicationId], + "object" : ["@id": getTaskOutputId(task, sourceName)], + "result" : ["@id": crateDir.relativize(target).toString()], + "actionStatus": "http://schema.org/CompletedActionStatus" + ] + } + + final wrroc = [ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph" : withoutNulls([ + [ + "@id" : path.name, + "@type" : "CreativeWork", + "about" : ["@id": "./"], + "conformsTo": [ + ["@id": "https://w3id.org/ro/crate/1.1"], + ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] + ] + ], + withoutNulls([ + "@id" : "./", + "@type" : "Dataset", + "author" : agent ? ["@id": agent["@id"]] : null, + "publisher" : publisherId ? ["@id": publisherId] : null, + "datePublished": getDatePublished(), + "conformsTo" : [ + ["@id": "https://w3id.org/ro/wfrun/process/0.1"], + ["@id": "https://w3id.org/ro/wfrun/workflow/0.1"], + ["@id": "https://w3id.org/ro/wfrun/provenance/0.1"], + ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] + ], + "name" : "Workflow run of ${manifest.name ?: metadata.projectName}", + "description": manifest.description ?: null, + "hasPart" : withoutNulls([ + ["@id": mainScriptId], + *asReferences(datasetParts), + *asReferences(inputFiles), + *asReferences(outputFiles) + ]), + "mainEntity" : ["@id": mainScriptId], + "mentions" : [ + ["@id": "#${session.uniqueId}"], + *asReferences(stagedInputs), + *asReferences(taskCreateActions), + *asReferences(taskOutputs), + *asReferences(publishCreateActions), + ], + "license" : manifest.license + ]), + [ + "@id" : "https://w3id.org/ro/wfrun/process/0.1", + "@type" : "CreativeWork", + "name" : "Process Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/ro/wfrun/workflow/0.1", + "@type" : "CreativeWork", + "name" : "Workflow Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/ro/wfrun/provenance/0.1", + "@type" : "CreativeWork", + "name" : "Provenance Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/workflowhub/workflow-ro-crate/1.0", + "@type" : "CreativeWork", + "name" : "Workflow RO-Crate", + "version": "1.0" + ], + withoutNulls([ + "@id" : mainScriptId, + "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], + "conformsTo" : ["@id": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE"], + "name" : manifest.name ?: metadata.projectName, + "description" : manifest.description, + "programmingLanguage": ["@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"], + "creator" : manifest.author, + "codeRepository" : metadata.repository, + "version" : metadata.commitId, + "license" : manifest.license, + "url" : metadata.repository ? normalizePath(metadata.scriptFile) : null, + "encodingFormat" : "application/nextflow", + "runtimePlatform" : "Nextflow " + nextflowVersion, + "hasPart" : asReferences(moduleSoftwareApplications), + "input" : asReferences(formalParameters), + "output" : [ + // TODO: workflow output targets + ], + "step" : asReferences(howToSteps), + ]), + [ + "@id" : "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", + "@type" : "ComputerLanguage", + "name" : "Nextflow", + "identifier": "https://www.nextflow.io/", + "url" : "https://www.nextflow.io/", + "version" : nextflowVersion + ], + [ + "@id" : softwareApplicationId, + "@type": "SoftwareApplication", + "name" : "Nextflow ${nextflowVersion}" + ], + *moduleSoftwareApplications, + *toolSoftwareApplications, + *formalParameters, + *howToSteps, + [ + "@id" : organizeActionId, + "@type" : "OrganizeAction", + "agent" : agent ? ["@id": agent["@id"]] : null, + "instrument": ["@id": softwareApplicationId], + "name" : "Run of Nextflow ${nextflowVersion}", + "object" : asReferences(controlActions), + "result" : ["@id": "#${session.uniqueId}"], + "startTime" : dateStarted, + "endTime" : dateCompleted + ], + [ + "@id" : "#${session.uniqueId}", + "@type" : "CreateAction", + "agent" : agent ? ["@id": agent["@id"]] : null, + "name" : "Nextflow workflow run ${session.uniqueId}", + "startTime" : dateStarted, + "endTime" : dateCompleted, + "instrument": ["@id": mainScriptId], + "object" : [ + *asReferences(propertyValues), + *asReferences(inputFiles), + ], + "result" : asReferences(outputFiles) + ], + agent, + organization, + *contactPoints, + *datasetParts, + *propertyValues, + *controlActions, + *stagedInputs, + *taskCreateActions, + *taskOutputs, + *publishCreateActions, + *inputFiles, + *outputFiles, + ]) + ] + + // render manifest to JSON file + path.text = JsonOutput.prettyPrint(JsonOutput.toJson(wrroc)) + } + + private static String getDatePublished() { + return LocalDateTime.now().format(DateTimeFormatter.ISO_DATE) + } + + /** + * Parse information about the agent running the workflow. + * + * @param opts + */ + private Map getAgentInfo(Map opts) { + final result = [:] + + if( !opts.agent ) + return null + + final agentOpts = opts.agent as Map + result["@id"] = agentOpts.getOrDefault("orcid", "agent-1") + result["@type"] = "Person" + if( agentOpts.name ) + result.name = agentOpts.name + + // Check for contact information + if( agentOpts.email || agentOpts.phone ) { + final contactPointId = getContactPointInfo(agentOpts) + if( contactPointId ) + result.contactPoint = ["@id": contactPointId] + } + + return result + } + + /** + * Parse information about the organization of the agent running the workflow. + * + * @param opts + */ + private Map getOrganizationInfo(Map opts) { + final result = [:] + + if( !opts.organization ) + return null + + final orgOpts = opts.organization as Map + result["@id"] = orgOpts.getOrDefault("ror", "organization-1") + result["@type"] = "Organization" + if( orgOpts.name ) + result.name = orgOpts.name + + // Check for contact information + if( orgOpts.email || orgOpts.phone ) { + final contactPointId = getContactPointInfo(orgOpts) + if( contactPointId ) + result.contactPoint = ["@id": contactPointId] + } + + return result + } + + /** + * Parse a contact point and add it to the list of contact points. + * + * @param opts + */ + private String getContactPointInfo(Map opts) { + // Prefer email for the contact point ID + String contactPointId = null + if( opts.email ) + contactPointId = "mailto:" + opts.email + else if( opts.phone ) + contactPointId = opts.phone + + if( !contactPointId ) + return null + + final contactPoint = [:] + contactPoint["@id"] = contactPointId + contactPoint["@type"] = "ContactPoint" + if( opts.contactType ) + contactPoint.contactType = opts.contactType + if( opts.email ) + contactPoint.email = opts.email + if( opts.phone ) + contactPoint.phone = opts.phone + if( opts.orcid ) + contactPoint.url = opts.orcid + if( opts.ror ) + contactPoint.url = opts.ror + + contactPoints.add(contactPoint) + return contactPointId + } + + /** + * Parse information about the RO-Crate publisher. + * + * @param opts + * @param agent + * @param organization + */ + private static String getPublisherId(Map opts, Map agent, Map organization) { + if( !opts.publisher ) + return null + + final publisherId = opts.publisher + + // Check if the publisher id references either the agent or the organization + final agentId = agent?["@id"] + final organizationId = organization?["@id"] + if( publisherId != agentId && publisherId != organizationId ) + return null + + return publisherId + } + + /** + * Get the parameter schema of a pipeline as a map. + * + * @param path + */ + private static Map getParameterSchema(Path path) { + final schema = new JsonSlurper().parseText(path.text) as Map + + Map defs = null + if( schema['$defs'] ) + defs = schema['$defs'] as Map + else if( schema['defs'] ) + defs = schema['defs'] as Map + else if( schema['definitions'] ) + defs = schema['definitions'] as Map + + if( !defs ) + return [:] + + final schemaProps = schema.properties as Map ?: [:] + final defsProps = defs.values().collect { defn -> + (defn as Map).properties ?: [:] + } as List + final allProps = [schemaProps] + defsProps + final entries = allProps.collectMany { props -> + (props as Map).entrySet() + } as Map.Entry[] + + return Map.ofEntries(entries) + } + + /** + * Determine the type of a parameter based on its + * schema and/or runtime value. + * + * @param name + * @param value + * @param schema + */ + private static String getParameterType(String name, Object value, Map schema) { + // infer from schema + if( schema ) { + final type = schema.type + final format = schema.format + + switch( type ) { + case "boolean": + return "Boolean" + case "integer": + case "number": + return "Number" + case "string": + return \ + format == "file-path" ? "File" : + format == "directory-path" ? "Dataset" : + "Text" + } + } + + // infer from runtime value + switch( value ) { + case Boolean: + return "Boolean" + case Number: + return "Number" + case CharSequence: + return "Text" + case List: + case Map: + return "Text" + default: + return null + } + } + + /** + * Get the canonical id of a module script. + * + * @param name + */ + private String getFormalParameterId(String name) { + return "#param/${name}" + } + + /** + * Get the canonical id of a module script. + * + * @param process + */ + private String getModuleId(ProcessDef process) { + return "#module/${process.baseName}" + } + + /** + * Get the canonical url of a module script. + * + * @param process + */ + private String getModuleUrl(ProcessDef process) { + final scriptPath = ScriptMeta.get(process.getOwner()).getScriptPath().normalize() + return normalizePath(scriptPath) + } + + /** + * Get the canonical id of a tool used by a module. + * + * @param moduleName + * @param toolName + */ + private static String getToolId(String moduleName, String toolName) { + return "#module/${moduleName}/${toolName}" + } + + /** + * Get the canonical id of a process in the workflow DAG. + * + * @param process + */ + private static String getProcessControlId(TaskProcessor process) { + return "#process-control/${process.name}" + } + + private static String getProcessStepId(TaskProcessor process) { + return "#process-step/${process.name}" + } + + /** + * Get the relative name of a staged input. + * + * @param source + * @param session + */ + private static String getStagedInputName(Path source, Session session) { + final stageDir = ProvHelper.getStageDir(session) + return stageDir.relativize(source).toString() + } + + /** + * Get the canonical id of a task. + * + * @param task + */ + private static String getTaskId(TaskRun task) { + return "#task/${task.hash}" + } + + /** + * Get the relative name of a task output. + * + * @param task + * @param target + */ + private static String getTaskOutputName(TaskRun task, Path target) { + final workDir = task.workDir.toUriString() + return target.toUriString().replace(workDir + '/', '') + } + + /** + * Get the canonical id of a task output. + * + * @param task + * @param name + */ + private static String getTaskOutputId(TaskRun task, String name) { + return "#task/${task.hash}/${name}" + } + + private static String getTaskOutputId(TaskRun task, Path target) { + return "#task/${task.hash}/${getTaskOutputName(task, target)}" + } + + /** + * Get the nf-core meta.yml of a Nextflow module as a map. + * + * @param process + */ + private static Map getModuleSchema(ProcessDef process) { + final metaFile = ScriptMeta.get(process.getOwner()).getModuleDir().resolve('meta.yml') + return Files.exists(metaFile) + ? new Yaml().load(metaFile.text) as Map + : null + } + + /** + * Get the RO-Crate "@type" of a path based on whether + * it is a file or directory. + * + * @param path + */ + private static String getType(Path path) { + return path.isDirectory() + ? "Dataset" + : "File" + } + + /** + * Get the encodingFormat of a file as MIME Type. + * + * @param path Path to file + * @return the MIME type of the file, or null if it's not a file. + */ + private static String getEncodingFormat(Path path) { + if( !(path && path.exists() && path.isFile()) ) + return null + + String mime = Files.probeContentType(path) + if( mime ) + return mime + + // It seems that YAML has a media type only since beginning of 2024 + // Set this by hand if this is run on older systems: + // https://httptoolkit.com/blog/yaml-media-type-rfc/ + if( ["yml", "yaml"].contains(path.getExtension()) ) + return "application/yaml" + + return null + } + + private static List asReferences(List values) { + return values.collect { value -> ["@id": value["@id"]] } + } + + private static List withoutNulls(List list) { + return list.findAll { v -> v != null } + } + + private static Map withoutNulls(Map map) { + return map.findAll { k, v -> v != null } + } + +}