diff --git a/Snakefile b/Snakefile index d3e8a363..38c02d0d 100644 --- a/Snakefile +++ b/Snakefile @@ -144,7 +144,7 @@ rule download_cgi: output: f'{DATA_DIR}/cgi/cgi_biomarkers_per_variant.tsv' shell: dedent(f'''\ cd {DATA_DIR}/cgi - wget https://www.cancergenomeinterpreter.org/data/cgi_biomarkers_20180117.zip + wget https://www.cancergenomeinterpreter.org/data/biomarkers/cgi_biomarkers_20180117.zip unzip cgi_biomarkers_20180117.zip ''') diff --git a/src/dgidb/README.md b/src/dgidb/README.md index 38ab659a..d0a6c899 100644 --- a/src/dgidb/README.md +++ b/src/dgidb/README.md @@ -3,3 +3,7 @@ This loader pulls data into GraphKB using the [DGIdb](https://www.dgidb.org/) API. > :warning: Since this loads gene-drug relationships. Loaders for genes and drugs should be run beforehand + +```bash +node bin/load.js api dgidb +``` diff --git a/src/dgidb/index.js b/src/dgidb/index.js index 48690620..742bc98a 100644 --- a/src/dgidb/index.js +++ b/src/dgidb/index.js @@ -14,35 +14,53 @@ const ajv = new Ajv(); const recordSpec = ajv.compile(spec); -const BASE_URL = 'https://dgidb.org/api/v2'; +const BASE_URL = 'https://dgidb.org/api/graphql'; -const processRecord = async ({ conn, record, source }) => { +const processRecord = async ({ + conn, record, source, counts, +}) => { checkSpec(recordSpec, record); - const { - entrez_id: entrezId, - concept_id: chemblId, - interaction_types: interactionTypes, - id, - } = record; - - const [gene] = await _entrezGene.fetchAndLoadByIds(conn, [entrezId]); + const { node: { id, conceptId: chemblId, interactions } } = record; + const drug = await _chembl.fetchAndLoadById(conn, chemblId.replace('chembl:', '')); - const interactionType = interactionTypes.map(i => i.toLowerCase().trim()).sort().join(';'); - - await conn.addRecord({ - content: { - actionType: interactionType, - in: rid(drug), - out: rid(gene), - source: rid(source), - uuid: id, // use the input uuid as the uuid rather than generating one - }, - existsOk: true, - fetchExisting: false, - target: 'TargetOf', - }); + for (const interaction of interactions) { + const { gene: { conceptId, name }, interactionTypes } = interaction; + const interactionType = interactionTypes.map(item => item.type).sort().join(';'); + + let geneRid; + + if (conceptId.split(':')[0] === 'hgnc') { + const hgncRecord = await conn.getUniqueRecordBy({ + filters: { + AND: [{ displayName: name }, { sourceId: conceptId }, + { source: { filters: { name: 'hgnc' }, target: 'Source' } }], + }, + returnProperies: ['out_CrossReferenceOf'], + target: 'Feature', + }); + geneRid = hgncRecord.out_CrossReferenceOf[0].in; + } else { + logger.info(`skip unrecognized concept id: ${conceptId}`); + counts.skip++; + } + + if (geneRid !== undefined) { + await conn.addRecord({ + content: { + actionType: interactionType, + in: rid(drug), + out: geneRid, + source: rid(source), + uuid: id, // use the input uuid as the uuid rather than generating one + }, + existsOk: true, + fetchExisting: false, + target: 'TargetOf', + }); + } + } }; @@ -50,31 +68,60 @@ const upload = async ({ conn, url = BASE_URL }) => { logger.info('creating the source record'); const source = rid(await conn.addSource(SOURCE_DEFN)); const limit = 100; - let page = `${url}/interactions?count=${limit}&page=1`; const counts = { error: 0, skip: 0, success: 0 }; - // pre-cache the entrez genes - logger.info('pre-loading the entrez gene list'); - await _entrezGene.preLoadCache(conn); logger.info('pre-loading the chembl drug list'); await _chembl.preLoadCache(conn); + let endCursor = '', + hasNextPage = true; - while (page) { - logger.info(`loading: ${page}`); + while (hasNextPage) { const resp = await request({ + body: { + query: `{ + drugs(first:${limit}${endCursor}) { + pageInfo { + endCursor + hasNextPage + } + pageCount + edges { + cursor + node { + id + conceptId + interactions { + gene { + name + conceptId + longName + } + interactionTypes { + type + } + } + } + } + } + }`, + }, json: true, - method: 'GET', - uri: page, + method: 'POST', + uri: url, }); - const { _meta: { links: { next } }, records } = resp; - page = next; - // process this batch of records - for (const record of records) { - logger.info(`processing ${record.id}`); + const { data: { drugs: { edges, pageInfo } } } = resp; + + endCursor = ` after:"${pageInfo.endCursor}"`; + hasNextPage = pageInfo.hasNextPage; + + for (const record of edges) { + logger.info(`processing ${record.cursor}`); try { - await processRecord({ conn, record, source }); + await processRecord({ + conn, counts, record, source, + }); counts.success++; } catch (err) { logger.error(err); diff --git a/src/dgidb/spec.json b/src/dgidb/spec.json index aad7d801..6e26cce1 100644 --- a/src/dgidb/spec.json +++ b/src/dgidb/spec.json @@ -1,47 +1,48 @@ { "properties": { - "concept_id": { - "pattern": "^chembl:CHEMBL\\d+$", + "cursor":{ "type": "string" }, - "entrez_id": { - "min": 1, - "type": "number" - }, - "id": { - "format": "uuid", - "type": "string" - }, - "interaction_direction": { - "items": { - "type": [ - "string", - "null" - ] - }, - "type": "array" - }, - "interaction_types": { - "items": { - "type": "string" - }, - "type": "array" - }, - "score": { - "type": "number" - }, - "sources": { - "items": { - "type": "string" + "node": { + "properties": { + "conceptId": { + "pattern": "^chembl:CHEMBL\\d+$", + "type": "string" + }, + "id": { + "format": "uuid", + "type": "string" + }, + "interactions": { + "items": { + "properties": { + "gene": { + "properties": { + "conceptId": { + "type": "string" + } + } + }, + "interactionTypes": { + "items": { + "properties": { + "type": { + "type": "string" + } + } + }, + "type": "array" + } + } + }, + "type": "array" + } }, - "type": "array" + "required": [ + "conceptId" + ], + "type": "object" } }, - "required": [ - "entrez_id", - "concept_id", - "interaction_types", - "id" - ], "type": "object" }