Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TASK-5789 - Add dbSNP to CellBase #689

Merged
merged 23 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e49d994
Implement dbSNP file download as 'variation' TASK-5794
imedina Mar 11, 2024
26a008a
Implement dbSNP file download as 'variation' TASK-5794
imedina Mar 11, 2024
a267dc3
Implement VariationBuilder TASK-5794
imedina Mar 11, 2024
c583edd
lib: minor changes when downloading dbSNP data, #TASK-5815, #TASK-5789
jtarraga Mar 11, 2024
b75b1f7
lib: some improvements in downloading dbSNP data, #TASK-5816, #TASK-5789
jtarraga Mar 11, 2024
4e68dab
lib: load dbSNP data in the CellBase MongoDB collection 'snp', #TASK-…
jtarraga Mar 11, 2024
2d6ca96
lib: implement SnpMongoDBAdaptor, #TASK-5794, TASK-5789
jtarraga Mar 11, 2024
56b3eb6
server: add endpoint 'snp' and update endpoints variant/search varian…
jtarraga Mar 11, 2024
82e57f3
Several improvements
imedina Mar 12, 2024
bbccffe
Final dbSNP builder implemented
imedina Mar 12, 2024
692cad2
lib: update VariantMongoDBAdaptor according to the SNP biodata change…
jtarraga Mar 12, 2024
90a4f68
lib: fix NPE, #TASK-5816, #TASK-5789
jtarraga Mar 12, 2024
36c64fe
server: add the endpoints variant/snp/search and variant/snp/startsWi…
jtarraga Mar 13, 2024
7670702
server: update variant annotator to return the dbSNP IDs in the field…
jtarraga Mar 13, 2024
f206fe5
lib: the variant annotation calculator search SNP ids only if the col…
jtarraga Mar 13, 2024
2dbd3b4
lib: normalize variants created by SNPs before searching in the varia…
jtarraga Mar 15, 2024
0b173e9
Fix some Sonar issues, #TASK-5789
jtarraga Mar 15, 2024
ba59eaa
Merge branch 'release-5.8.x' into TASK-5789
jtarraga Apr 9, 2024
8e5f1b4
client: update VariantClient to take into account the dbSNP endpoints…
jtarraga Apr 12, 2024
4cf6541
Merge branch 'TASK-5789' of https://github.com/opencb/cellbase into T…
jtarraga Apr 12, 2024
377c0b7
client: use JUnit 5, #TASK-6020, #TASK-5789
jtarraga Apr 12, 2024
1777ca2
lib: check and remove duplicated xrefs/dbSNP, #TASK-5821, #TASK-5789
jtarraga Apr 16, 2024
30f167c
test: improve JUnit tests, and exception management, #TASK-6020, #TAS…
jtarraga Apr 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import com.beust.jcommander.*;
import org.opencb.cellbase.app.cli.CliOptionsParser;
import org.opencb.cellbase.core.api.key.ApiKeyQuota;
import org.opencb.cellbase.lib.EtlCommons;

import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -74,6 +75,7 @@ public AdminCliOptionsParser() {
jCommander.addCommand("validate", validationCommandOptions);
}

/**
 * Parses the given command-line arguments by delegating to the underlying JCommander instance.
 *
 * @param args raw command-line arguments
 * @throws ParameterException propagated from {@code jCommander.parse(args)}
 */
@Override
public void parse(String[] args) throws ParameterException {
jCommander.parse(args);
}
Expand All @@ -87,9 +89,13 @@ public class DownloadCommandOptions {
@ParametersDelegate
public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions;

@Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, "
+ "variation, variation_functional_score, regulation, protein, conservation, "
+ "clinical_variants, repeats, svs, pubmed and 'all' to download everything", required = true, arity = 1)
@Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download:"
+ EtlCommons.GENOME_DATA + ", " + EtlCommons.GENE_DATA + ", " + EtlCommons.VARIATION_DATA + ", "
+ EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA + ", " + EtlCommons.MISSENSE_VARIATION_SCORE_DATA + ", "
+ EtlCommons.REGULATION_DATA + ", " + EtlCommons.PROTEIN_DATA + ", " + EtlCommons.CONSERVATION_DATA + ", "
+ EtlCommons.CLINICAL_VARIANTS_DATA + ", " + EtlCommons.REPEATS_DATA + ", " + EtlCommons.OBO_DATA + ", "
+ EtlCommons.PUBMED_DATA + ", " + EtlCommons.PHARMACOGENOMICS_DATA + "; and 'all' to download everything",
required = true, arity = 1)
public String data;

@Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA;
import static org.opencb.cellbase.lib.EtlCommons.*;

/**
* Created by imedina on 03/02/15.
Expand Down Expand Up @@ -132,6 +132,9 @@ public void execute() {
case EtlCommons.REFSEQ_DATA:
parser = buildRefSeq();
break;
case EtlCommons.VARIATION_DATA:
parser = buildVariation();
break;
case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA:
parser = buildCadd();
break;
Expand Down Expand Up @@ -275,6 +278,21 @@ private CellBaseBuilder buildRefSeq() {
return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer);
}

/**
 * Prepares the variation build folder and returns the builder for the variation data.
 *
 * <p>The build folder is created when missing, the dbSNP version file is copied from the download
 * folder into the build folder (replacing any previous copy), and a {@link VariationBuilder} is
 * created that reads from the download folder and writes through a JSON file serializer rooted at
 * the build folder.
 *
 * @return the builder for the variation (currently only dbSNP) data
 * @throws IOException if the build folder cannot be created or the version file cannot be copied
 */
private CellBaseBuilder buildVariation() throws IOException {
    Path downloadVariationPath = downloadFolder.resolve(VARIATION_DATA);
    Path buildVariationPath = buildFolder.resolve(VARIATION_DATA);
    if (!Files.exists(buildVariationPath)) {
        // Unlike File.mkdirs(), whose boolean result was ignored, this reports a failed creation
        // immediately instead of letting it surface later as an unrelated copy/serializer error.
        Files.createDirectories(buildVariationPath);
    }

    CellBaseFileSerializer variationSerializer = new CellBaseJsonFileSerializer(buildVariationPath);

    // Currently, only dbSNP data
    Files.copy(downloadVariationPath.resolve(DBSNP_VERSION_FILENAME), buildVariationPath.resolve(DBSNP_VERSION_FILENAME),
            StandardCopyOption.REPLACE_EXISTING);
    return new VariationBuilder(downloadVariationPath, variationSerializer, configuration);
}

private CellBaseBuilder buildCadd() {
Path variationFunctionalScorePath = downloadFolder.resolve("variation_functional_score");
copyVersionFiles(Arrays.asList(variationFunctionalScorePath.resolve("caddVersion.json")));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ public void execute() {
case EtlCommons.GENE_DATA:
downloadFiles.addAll(downloader.downloadGene());
break;
// case EtlCommons.VARIATION_DATA:
// downloadManager.downloadVariation();
// break;
case EtlCommons.VARIATION_DATA:
downloadFiles.addAll(downloader.downloadVariation());
break;
case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA:
downloadFiles.addAll(downloader.downloadCaddScores());
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
import java.util.List;
import java.util.concurrent.ExecutionException;

import static org.opencb.cellbase.lib.EtlCommons.*;

/**
* Created by imedina on 03/02/15.
*/
Expand Down Expand Up @@ -372,30 +374,57 @@ private void checkParameters() throws CellBaseException {
private void loadVariationData() throws NoSuchMethodException, InterruptedException, ExecutionException,
InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException,
IOException, LoaderException, CellBaseException {
Path variationPath = input.resolve(VARIATION_DATA);
// First load data
// Common loading process from CellBase variation data models
if (field == null) {
DirectoryStream<Path> stream = Files.newDirectoryStream(input,
// Common loading process from CellBase variation data models
DirectoryStream<Path> stream = Files.newDirectoryStream(variationPath,
entry -> entry.getFileName().toString().startsWith("variation_chr"));

int numLoadings = 0;
for (Path entry : stream) {
logger.info("Loading file '{}'", entry);
loadRunner.load(input.resolve(entry.getFileName()), "variation", dataRelease);
loadRunner.load(variationPath.resolve(entry.getFileName()), "variation", dataRelease);
numLoadings++;
}

// Create index
createIndex("variation");

// Update release (collection and sources)
List<Path> sources = new ArrayList<>(Arrays.asList(
input.resolve("ensemblVariationVersion.json")
));
dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources);
if (numLoadings > 0) {
// Create index
createIndex("variation");

// Update release (collection and sources)
List<Path> sources = new ArrayList<>(Arrays.asList(
variationPath.resolve("ensemblVariationVersion.json")
));
dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources);
} else {
logger.info("Any variation file 'variation_chr...' found within folder '{}'", variationPath);
}
} else {
// Custom update required e.g. population freqs loading
logger.info("Loading file '{}'", variationPath);
loadRunner.load(variationPath, "variation", dataRelease, field, innerFields);
}

// Load dbSNP
Path dbSnpFilePath = variationPath.resolve(DBSNP_NAME + ".json.gz");
if (dbSnpFilePath.toFile().exists()) {
if (variationPath.resolve(DBSNP_VERSION_FILENAME).toFile().exists()) {
logger.info("Loading dbSNP file '{}'", dbSnpFilePath);
loadRunner.load(dbSnpFilePath, SNP_COLLECTION_NAME, dataRelease);

// Create index
createIndex(SNP_COLLECTION_NAME);

// Update release (collection and sources)
List<Path> sources = Collections.singletonList(variationPath.resolve(DBSNP_VERSION_FILENAME));
dataReleaseManager.update(dataRelease, SNP_COLLECTION_NAME, EtlCommons.VARIATION_DATA, sources);
} else {
logger.warn("In order to load the dbSNP file you need the version file {} within the folder '{}'", DBSNP_VERSION_FILENAME,
variationPath);
}
} else {
logger.info("Loading file '{}'", input);
loadRunner.load(input, "variation", dataRelease, field, innerFields);
logger.warn("Any dbSNP file found within the folder '{}'", variationPath);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.opencb.cellbase.core.api;

import org.opencb.cellbase.core.api.query.AbstractQuery;
import org.opencb.cellbase.core.api.query.QueryException;
import org.opencb.cellbase.core.api.query.QueryParameter;

import java.util.List;
import java.util.Map;

/**
 * Query bean for SNP searches.
 *
 * <p>It carries four filter fields, each exposed through a {@link QueryParameter}: a list of IDs
 * ({@code id}), a {@code chromosome}, a {@code position} and a {@code reference}. All setters
 * return this instance so calls can be chained.
 */
public class SnpQuery extends AbstractQuery {

    @QueryParameter(id = "id")
    private List<String> ids;
    @QueryParameter(id = "chromosome")
    private String chromosome;
    @QueryParameter(id = "position")
    private Integer position;
    @QueryParameter(id = "reference")
    private String reference;

    /** Creates an empty query with every field unset. */
    public SnpQuery() {
    }

    /**
     * Creates a query from a map of raw parameters, handing the map to the superclass constructor.
     *
     * @param params raw query parameters
     * @throws QueryException propagated from the superclass constructor
     */
    public SnpQuery(Map<String, String> params) throws QueryException {
        super(params);
    }

    /** This query has no constraints of its own, so validation is intentionally a no-op. */
    @Override
    protected void validateQuery() {
        // nothing to validate
    }

    @Override
    public String toString() {
        return "SnpQuery{"
                + "ids=" + ids
                + ", chromosome='" + chromosome + '\''
                + ", position='" + position + '\''
                + ", reference='" + reference + '\''
                + '}';
    }

    /** @return the list of IDs to filter by */
    public List<String> getIds() {
        return this.ids;
    }

    /**
     * @param ids the list of IDs to filter by
     * @return this query, for chaining
     */
    public SnpQuery setIds(List<String> ids) {
        this.ids = ids;
        return this;
    }

    /** @return the chromosome to filter by */
    public String getChromosome() {
        return this.chromosome;
    }

    /**
     * @param chromosome the chromosome to filter by
     * @return this query, for chaining
     */
    public SnpQuery setChromosome(String chromosome) {
        this.chromosome = chromosome;
        return this;
    }

    /** @return the position to filter by */
    public Integer getPosition() {
        return this.position;
    }

    /**
     * @param position the position to filter by
     * @return this query, for chaining
     */
    public SnpQuery setPosition(Integer position) {
        this.position = position;
        return this;
    }

    /** @return the reference value to filter by */
    public String getReference() {
        return this.reference;
    }

    /**
     * @param reference the reference value to filter by
     * @return this query, for chaining
     */
    public SnpQuery setReference(String reference) {
        this.reference = reference;
        return this;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ public class DownloadProperties {
private URLProperties clinvarSummary;
private URLProperties clinvarVariationAllele;
private URLProperties clinvarEfoTerms;
private URLProperties dbSNP;
private URLProperties iarctp53;
private URLProperties docm;
private URLProperties docmVersion;
Expand Down Expand Up @@ -263,6 +264,15 @@ public DownloadProperties setClinvarEfoTerms(URLProperties clinvarEfoTerms) {
return this;
}

/**
 * @return the URL properties stored in the {@code dbSNP} field
 */
public URLProperties getDbSNP() {
return dbSNP;
}

/**
 * Stores the given URL properties in the {@code dbSNP} field.
 *
 * @param dbSNP the URL properties to store
 * @return this instance, so setter calls can be chained
 */
public DownloadProperties setDbSNP(URLProperties dbSNP) {
this.dbSNP = dbSNP;
return this;
}

public URLProperties getIarctp53() {
return iarctp53;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,8 @@ public CellBaseException(String msg) {
super(msg);
}

/**
 * Creates an exception with a detail message and an underlying cause; both are passed to the
 * superclass constructor.
 *
 * @param msg the detail message
 * @param e the exception that caused this one
 */
public CellBaseException(String msg, Exception e) {
super(msg, e);
}
}

4 changes: 4 additions & 0 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ download:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz
clinvarEfoTerms:
host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv
dbSNP:
host: https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz
version: "156"
iarctp53:
host: http://p53.iarc.fr/ajax/Zipper.ashx
docm:
Expand Down Expand Up @@ -197,6 +200,7 @@ species:
- refseq
- regulation
- repeats
- variation
- variation_functional_score
- splice_score
shards:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,11 @@ public class EtlCommons {
public static final String IARCTP53_FILE = "IARC-TP53.zip";
public static final String GWAS_FILE = "gwas_catalog.tsv";
public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz";
public static final String DBSNP_FILE = "All.vcf.gz";
@Deprecated
public static final String DBSNP_FILE = "GCF_000001405.40.gz";
public static final String DBSNP_NAME = "dbSNP";
public static final String DBSNP_VERSION_FILENAME = DBSNP_NAME + "Version.json";
public static final String SNP_COLLECTION_NAME = "snp";

public static final String STRUCTURAL_VARIANTS_DATA = "svs";
public static final String REPEATS_DATA = "repeats";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) {
super(serializer);
this.caddFilePath = caddFilePath;

logger = LoggerFactory.getLogger(ConservationBuilder.class);
logger = LoggerFactory.getLogger(CaddScoreBuilder.class);
}

/* Example:
Expand Down
Loading
Loading