Skip to content

Commit

Permalink
Merge branch 'release-6.2.x' into TASK-6647
Browse files Browse the repository at this point in the history
  • Loading branch information
juanfeSanahuja committed Aug 5, 2024
2 parents 0f2a6d2 + 386a510 commit bb51d33
Show file tree
Hide file tree
Showing 23 changed files with 509 additions and 82 deletions.
54 changes: 54 additions & 0 deletions .github/workflows/manual-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
name: Manual Junit test the project

# Manually triggered JUnit run against a branch chosen at dispatch time.
on:
  workflow_dispatch:
    inputs:
      branch:
        description: 'CellBase branch to run the tests'
        default: 'develop'
        required: true
      fail-never:
        type: boolean
        description: 'The process executes all tests even if some fail.'
        default: false
        required: false

jobs:
  test:
    name: JUnit Test
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.branch }}
          fetch-depth: '0'
      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          distribution: 'temurin'
          java-version: '11'
          cache: 'maven'
      - name: K8s Tunnel MongoDB
        env:
          # Passed through the environment so the kubeconfig content is never
          # expanded inside the script text (quotes, `$` or backticks in the
          # secret would otherwise be interpreted by the shell).
          AZURE_KUBE_CONFIG: ${{ secrets.AZURE_KUBE_CONFIG }}
        run: |
          wget https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl
          chmod +x ./kubectl
          printf '%s\n' "${AZURE_KUBE_CONFIG}" > admin.conf
          chmod 600 admin.conf
          # The port-forward runs in the background for the rest of the job.
          ./kubectl -n cellbase-db port-forward services/cellbase-rs0-svc 27017:27017 --kubeconfig ./admin.conf &
      - name: Install dependencies branches
        env:
          # Resolve dependencies for the branch that was actually checked out
          # (the `branch` input), not the branch the workflow was dispatched from.
          TARGET_BRANCH: ${{ github.event.inputs.branch }}
        run: |
          if [ -f "./.github/workflows/scripts/get_same_branch.sh" ]; then
            chmod +x ./.github/workflows/scripts/get_same_branch.sh
            ./.github/workflows/scripts/get_same_branch.sh "${TARGET_BRANCH}"
          else
            echo "./.github/workflows/scripts/get_same_branch.sh does not exist."
          fi
      - name: Test and Analyze
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # Needed to get PR information, if any
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
          FAIL_NEVER_INPUT: ${{ github.event.inputs.fail-never }}
        run: |
          FAIL_NEVER=""
          if [ "${FAIL_NEVER_INPUT}" = "true" ]; then
            FAIL_NEVER="--fail-never"
          fi
          # FAIL_NEVER is intentionally unquoted: when empty it must expand to nothing.
          mvn install surefire-report:report ${FAIL_NEVER} -Dcheckstyle.skip
15 changes: 15 additions & 0 deletions .github/workflows/pull-request-approved.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Pull request approve workflow

on:
  pull_request_review:
    types: [submitted]

jobs:
  build:
    # `submitted` fires for every review (comment, changes requested, approval).
    # Only an approval should start the build; `test` depends on `build`, so it
    # is skipped as well when this condition is false.
    if: github.event.review.state == 'approved'
    uses: opencb/java-common-libs/.github/workflows/build-java-app-workflow.yml@develop

  test:
    name: "Test analysis"
    needs: build
    uses: ./.github/workflows/test-analysis.yml
    secrets: inherit
4 changes: 2 additions & 2 deletions .github/workflows/test-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ jobs:
name: Test and push Sonar analysis
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
fetch-depth: '0'
- name: Set up JDK 11
uses: actions/setup-java@v3
uses: actions/setup-java@v4
with:
distribution: 'temurin'
java-version: '11'
Expand Down
2 changes: 1 addition & 1 deletion cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ RUN cd /opt/ensembl && \
git clone https://github.com/Ensembl/ensembl-compara.git && \
git clone https://github.com/Ensembl/ensembl-io.git

ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase
ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts
8 changes: 4 additions & 4 deletions cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157";
our $ENSEMBL_GENOMES_USER = "anonymous";

## Vertebrates
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38";
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38";
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38";
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38";
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38";
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38";
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38";
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38";
#our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
#our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
#our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";
Expand Down
10 changes: 10 additions & 0 deletions cellbase-app/app/scripts/gnomad/mitochondrial/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
gnomAD Mitochondrial DNA (mtDNA) variants v3.1:
URL: https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1/vcf/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz

Mapping file in ticket BIOINFO-99: mapping_file_gnomad_mt_mod_file.txt

Script to preprocess original VCF from gnomad: gnomad_mt.py

Script to load the gnomAD mt variants into OpenCGA and export their annotation.populationFrequencies objects in JSON format: opencga_gnomad_mt.sh


120 changes: 120 additions & 0 deletions cellbase-app/app/scripts/gnomad/mitochondrial/gnomad_mt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import sys
import gzip


# Population codes. The i-th entry of every pipe-separated pop_* INFO value is
# attributed to the i-th code in this list.
# NOTE(review): the order is assumed to match the population order declared in
# the gnomAD chrM VCF header -- confirm when changing the gnomAD release.
POPULATIONS = ['afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'nfe', 'oth', 'sas', 'mid']

# INFO header lines for the overall fields added to every variant.
HEADER_COMMON = [
    '##INFO=<ID=AC,Number=1,Type=Integer,Description="Calculated allele count">',
    '##INFO=<ID=AF,Number=1,Type=Float,Description="Calculated allele frequency">',
    '##INFO=<ID=GTC,Number=1,Type=String,Description="Calculated list of genotype counts (0/0,0/1,1/1)">'
]

# INFO header templates, instantiated once per population with str.format(pop=...).
HEADER_POP = [
    '##INFO=<ID=AF_{pop},Number=1,Type=Float,Description="Calculated allele frequency for {pop} population">',
    '##INFO=<ID=AC_{pop},Number=1,Type=Integer,Description="Calculated allele count for {pop} population">',
    '##INFO=<ID=AN_{pop},Number=1,Type=Integer,Description="Calculated allele number for {pop} population">',
    '##INFO=<ID=GTC_{pop},Number=1,Type=String,Description="Calculated list of genotype counts for {pop} population (0/0,0/1,1/1)">'
]


def _split_floats(value):
    """Parse a pipe-separated INFO value into a list of floats."""
    return [float(item) for item in value.split('|')]


def _split_ints(value):
    """Parse a pipe-separated INFO value into a list of ints."""
    return [int(item) for item in value.split('|')]


# INFO keys the calculations need, mapped to the parser for their value.
_INFO_PARSERS = {
    'pop_AF_hom': _split_floats,
    'pop_AF_het': _split_floats,
    'AF_hom': float,
    'AF_het': float,
    'pop_AC_hom': _split_ints,
    'pop_AC_het': _split_ints,
    'AC_hom': int,
    'AC_het': int,
    'pop_AN': _split_ints,
    'AN': int,
}


def _build_custom_header():
    """Return the extra INFO header lines as one newline-terminated string."""
    header_lines = list(HEADER_COMMON)
    for pop in POPULATIONS:
        header_lines.extend(template.format(pop=pop) for template in HEADER_POP)
    return '\n'.join(header_lines) + '\n'


def _parse_info(info_items):
    """Return the parsed values of the INFO entries listed in _INFO_PARSERS.

    Flags (entries without '=') and any other key are ignored. When a key is
    repeated, the last occurrence wins.
    """
    parsed = {}
    for info_item in info_items:
        info_key, separator, info_value = info_item.partition('=')
        if not separator:  # skipping flags
            continue
        parser = _INFO_PARSERS.get(info_key)
        if parser is not None:
            parsed[info_key] = parser(info_value)
    return parsed


def _calculate_new_info(values):
    """Compute the new INFO entries for one variant.

    `values` is the dict produced by _parse_info with every key of
    _INFO_PARSERS present. The returned dict keeps insertion order, which is
    the order the entries are written in: AF_{pop}..., AF, AC_{pop}..., AC,
    AN_{pop}..., GTC_{pop}..., GTC.
    """
    pop_ac_hom = values['pop_AC_hom']
    pop_ac_het = values['pop_AC_het']
    pop_an = values['pop_AN']
    ac_hom = values['AC_hom']
    ac_het = values['AC_het']
    new_info = {}

    # AF_{pop} = pop_AF_hom[i] + pop_AF_het[i]; AF = AF_hom + AF_het
    pop_af = [hom + het for hom, het in zip(values['pop_AF_hom'], values['pop_AF_het'])]
    for i, pop in enumerate(POPULATIONS):
        new_info['AF_' + pop] = pop_af[i]
    new_info['AF'] = values['AF_hom'] + values['AF_het']

    # AC_{pop} = pop_AC_hom[i] + pop_AC_het[i]; AC = AC_hom + AC_het
    pop_ac = [hom + het for hom, het in zip(pop_ac_hom, pop_ac_het)]
    for i, pop in enumerate(POPULATIONS):
        new_info['AC_' + pop] = pop_ac[i]
    new_info['AC'] = ac_hom + ac_het

    # AN_{pop} = pop_AN[i]
    for i, pop in enumerate(POPULATIONS):
        new_info['AN_' + pop] = pop_an[i]

    # GTC_{pop} = "<AN - AC>,<AC_het>,<AC_hom>", i.e. the (0/0,0/1,1/1) counts
    hom_ref = [an - ac for an, ac in zip(pop_an, pop_ac)]
    for i, pop in enumerate(POPULATIONS):
        new_info['GTC_' + pop] = ','.join(map(str, [hom_ref[i], pop_ac_het[i], pop_ac_hom[i]]))
    new_info['GTC'] = ','.join(map(str, [values['AN'] - (ac_hom + ac_het), ac_het, ac_hom]))

    return new_info


def main():
    """Add calculated AF/AC/AN/GTC INFO fields to a gzip-compressed VCF.

    Reads the input path from sys.argv[1] and writes a gzip-compressed copy to
    sys.argv[2]. Header lines are copied unchanged; the custom INFO header
    lines are inserted once, before the first '##VEP' line or, if the input has
    none, before the '#CHROM' column line. Each variant line gets the new
    entries appended to its INFO column.

    Returns:
        None on success, 2 when the two file arguments are missing.

    Raises:
        ValueError: if a variant line has fewer than 8 columns or lacks one of
            the INFO keys listed in _INFO_PARSERS.
    """
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: gnomad_mt.py <input.vcf.gz> <output.vcf.gz>\n')
        return 2

    custom_header = _build_custom_header()
    vcf_input_fpath = sys.argv[1]
    vcf_output_fpath = sys.argv[2]
    header_written = False

    # Context managers guarantee both files are closed (and the gzip stream
    # flushed) even when a line fails to parse.
    with gzip.open(vcf_input_fpath, 'rb') as vcf_input_fhand, \
            gzip.open(vcf_output_fpath, 'wt') as vcf_output_fhand:
        for line_number, raw_line in enumerate(vcf_input_fhand, start=1):
            line = raw_line.decode()

            if line.startswith('#'):
                # Insert the custom header before "##VEP"; fall back to the
                # "#CHROM" line so it is not lost when there is no "##VEP".
                insertion_point = line.startswith('##VEP') or not line.startswith('##')
                if insertion_point and not header_written:
                    vcf_output_fhand.write(custom_header)
                    header_written = True
                vcf_output_fhand.write(line)
                continue

            # VCF data lines are tab-delimited; splitting on tabs only keeps
            # INFO values that contain spaces intact.
            variant_items = line.rstrip('\r\n').split('\t')
            if len(variant_items) < 8:
                raise ValueError(
                    'Line {}: expected at least 8 tab-separated columns, found {}'.format(
                        line_number, len(variant_items)))
            info_items = variant_items[7].split(';')

            # Values are parsed afresh for every variant so that a missing key
            # is reported instead of silently reusing the previous variant's data.
            values = _parse_info(info_items)
            missing = [key for key in _INFO_PARSERS if key not in values]
            if missing:
                raise ValueError(
                    'Line {}: missing INFO field(s): {}'.format(line_number, ', '.join(missing)))

            new_info = _calculate_new_info(values)

            # Joining existing INFO field and new custom INFO data
            custom_info_data = ';'.join(key + '=' + str(value) for key, value in new_info.items())
            variant_items[7] = ';'.join(info_items + [custom_info_data])
            vcf_output_fhand.write('\t'.join(variant_items) + '\n')


if __name__ == '__main__':
    sys.exit(main())
44 changes: 44 additions & 0 deletions cellbase-app/app/scripts/gnomad/mitochondrial/opencga_gnomad_mt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash

# Loads the preprocessed gnomAD mtDNA VCF into OpenCGA, computes cohort stats
# from the aggregation mapping file, converts them to population frequencies
# and exports annotation.populationFrequencies as JSON.

# Variables
opencga="/home/opencga-client-2.12.0/bin/opencga.sh"
user="user"
host="host_name"
project="population"
project_name="Population"
study="gnomad_mt"
study_name="gnomAD v3.1 Mitocondrial DNA Variants"
study_path="data/${study}"
folder_path="/home/gnomad_mt"
mapping_file="mapping_file_gnomad_mt_mod_file.txt"
vcf_file="gnomad.genomes.v3.1.sites.chrM.mod.vcf.gz"
# The "/" separator is required: folder_path has no trailing slash.
mapping_file_path="${folder_path}/${mapping_file}"
vcf_file_path="${folder_path}/${vcf_file}"

# Every expansion below is quoted so that values containing spaces
# (e.g. study_name) reach the CLI as a single argument.

# Login
"$opencga" login "$user" --host "$host"

# Project creation
"$opencga" projects create --id "$project" --name "$project_name" --organism-scientific-name hsapiens --organism-assembly grch38 --host "$host"

# Study creation
"$opencga" studies create --id "$study" --name "$study_name" --project "$project" --host "$host"

# Folders creation within Catalog
"$opencga" files create --path "$study_path" --parents --study "$study" --type DIRECTORY --host "$host"

# Uploading gnomad mt variants VCF and mapping file for gnomad mt variants
"$opencga" files upload -i "$mapping_file_path" --path "$study_path" --study "$study" --host "$host"

"$opencga" files upload -i "$vcf_file_path" --path "$study_path" --study "$study" --host "$host"

# Variant index for gnomad mt variants VCF
"$opencga" operations variant-index --study "$study" --file "$vcf_file" --load-archive NO --load-split-data CHROMOSOME --host "$host"

# Variant stats index for gnomad mt variants. The corresponding cohorts and variant cohort stats will be generated using the information of interest provided in the mapping file and INFO column of the gnomad mt VCF
"$opencga" operations variant-stats-index --study "$study" --aggregation-mapping-file "$mapping_file" --aggregated BASIC --host "$host"

# Variant cohort stats will be converted to population frequencies data model (julie-tool)
"$opencga" operations variant-julie-run --project "$project" --host "$host"

# Export of annotation.populationFrequencies in json format
"$opencga" variant export-run --body_include annotation.populationFrequencies --body_project "$project" --project "$project" --output-file-format json --host "$host"
57 changes: 57 additions & 0 deletions cellbase-app/app/scripts/gnomad_mt_prepare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3

# Copyright 2015-2020 OpenCB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import requests
import sys
import json
import pathlib
from pathlib import Path


## Configure command-line options
parser = argparse.ArgumentParser()
parser.add_argument('-i', help="VCF file", required=True)


## Parse command-line parameters
args = parser.parse_args()
print(args.i)

# Stop with a usage error (exit status 2) instead of printing a note and then
# failing later inside open().
if not os.path.isfile(args.i):
    parser.error("VCF file does not exist: {}".format(args.i))


# For every variant line, print the line followed by its "AN=" INFO entries.
print("Using for loop")
with open(args.i, 'r') as vcf_file:  # closed automatically, even on error
    for line in vcf_file:
        if line.startswith("#"):  # header lines carry no INFO column
            continue
        line = line.strip()
        cols = line.split("\t")
        print(line)
        info_cols = cols[7].split(";")  # column 8 of a VCF line is INFO
        var = [x for x in info_cols if x.startswith("AN=")]
        print("{}".format(var))
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ public class DownloadProperties {
private EnsemblProperties ensembl;
private EnsemblProperties ensemblGenomes;
private URLProperties hgnc;
private URLProperties cancerHotspot;
private URLProperties refSeq;
private URLProperties refSeqFasta;
private URLProperties refSeqProteinFasta;
Expand Down Expand Up @@ -71,6 +72,7 @@ public class DownloadProperties {
private URLProperties hpoObo;
private URLProperties goObo;
private URLProperties doidObo;
private URLProperties mondoObo;
private URLProperties goAnnotation;
private URLProperties revel;
private URLProperties pubmed;
Expand Down Expand Up @@ -527,6 +529,24 @@ public DownloadProperties setHgnc(URLProperties hgnc) {
return this;
}

/**
 * Returns the {@code cancerHotspot} URL properties stored in this object.
 *
 * @return the current value of the {@code cancerHotspot} field
 */
public URLProperties getCancerHotspot() {
return cancerHotspot;
}

/**
 * Replaces the {@code cancerHotspot} URL properties.
 *
 * @param cancerHotspot the new value for the {@code cancerHotspot} field
 * @return this instance, so setter calls can be chained
 */
public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) {
this.cancerHotspot = cancerHotspot;
return this;
}

/**
 * Returns the {@code mondoObo} URL properties stored in this object.
 *
 * @return the current value of the {@code mondoObo} field
 */
public URLProperties getMondoObo() {
return mondoObo;
}

/**
 * Replaces the {@code mondoObo} URL properties.
 *
 * @param mondoObo the new value for the {@code mondoObo} field
 * @return this instance, so setter calls can be chained
 */
public DownloadProperties setMondoObo(URLProperties mondoObo) {
this.mondoObo = mondoObo;
return this;
}

public static class EnsemblProperties {

private DatabaseCredentials database;
Expand Down
Loading

0 comments on commit bb51d33

Please sign in to comment.