From a2797c508b885d8f7af00d389d4299daf1feed13 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Mon, 26 Feb 2024 11:17:59 -0600 Subject: [PATCH] Retrieve DRB columns from `locus_column_mapping` rather than guessing column names. --- extras/reduce_conf.json | 57 ++++++++++++++++++++++++++++++---------- extras/sample.csv | 8 +++--- scripts/pyard-reduce-csv | 24 ++++++++++------- 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/extras/reduce_conf.json b/extras/reduce_conf.json index 1a316a3..8e83d32 100644 --- a/extras/reduce_conf.json +++ b/extras/reduce_conf.json @@ -14,6 +14,12 @@ "r_drb1_typ2", "r_dpb1_typ1", "r_dpb1_typ2", + "r_drb3_typ1", + "r_drb3_typ2", + "r_drb4_typ1", + "r_drb4_typ2", + "r_drb5_typ1", + "r_drb5_typ2", "d_a_typ1", "d_a_typ2", "d_b_typ1", @@ -23,7 +29,13 @@ "d_drb1_typ1", "d_drb1_typ2", "d_dpb1_typ1", - "d_dpb1_typ2" + "d_dpb1_typ2", + "d_drb3_typ1", + "d_drb3_typ2", + "d_drb4_typ1", + "d_drb4_typ2", + "d_drb5_typ1", + "d_drb5_typ2" ], "locus_column_mapping": { "recipient": { @@ -39,13 +51,25 @@ "r_c_typ1", "r_c_typ2" ], + "dqb1": [ + "r_dpb1_typ1", + "r_dpb1_typ2" + ], "drb1": [ "r_drb1_typ1", "r_drb1_typ2" ], - "dqb1": [ - "r_dpb1_typ1", - "r_dpb1_typ2" + "drb3": [ + "r_drb3_typ1", + "r_drb3_typ2" + ], + "drb4": [ + "r_drb4_typ1", + "r_drb4_typ2" + ], + "drb5": [ + "r_drb5_typ1", + "r_drb5_typ2" ] }, "donor": { @@ -61,20 +85,30 @@ "d_c_typ1", "d_c_typ2" ], + "dqb1": [ + "d_dpb1_typ1", + "d_dpb1_typ2" + ], "drb1": [ "d_drb1_typ1", "d_drb1_typ2" ], - "dqb1": [ - "d_dpb1_typ1", - "d_dpb1_typ2" + "drb3": [ + "d_drb3_typ1", + "d_drb3_typ2" + ], + "drb4": [ + "d_drb4_typ1", + "d_drb4_typ2" + ], + "drb5": [ + "d_drb5_typ1", + "d_drb5_typ2" ] } }, - "redux_type": "lgx", "redux_cache_size": 1000, - "reduce_serology": false, "reduce_v2": true, "convert_v2_to_v3": false, @@ -84,17 +118,12 @@ "reduce_XX": false, "reduce_MAC": true, "map_drb345_to_drbx": false, - "locus_in_allele_name": true, "keep_locus_in_allele_name": true, - "new_column_for_redux": true, "reduced_column_prefix": "reduced_", - "generate_glstring": true, - "output_file_format": "csv", "apply_compression": "gzip", - "verbose_log": true } diff --git a/extras/sample.csv b/extras/sample.csv index 0e21ca2..75d3f3f 100644 --- a/extras/sample.csv +++ b/extras/sample.csv @@ -1,4 +1,4 @@ -rid,did,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2,d_a_typ1,d_a_typ2,d_b_typ1,d_b_typ2,d_c_typ1,d_c_typ2,d_drb1_typ1,d_drb1_typ2,d_dpb1_typ1,d_dpb1_typ2 -2110,123,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01 -2111,456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01 -2113,789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01 +rid,did,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2,d_a_typ1,d_a_typ2,d_b_typ1,d_b_typ2,d_c_typ1,d_c_typ2,d_drb1_typ1,d_drb1_typ2,d_dpb1_typ1,d_dpb1_typ2,r_drb3_typ1,r_drb3_typ2,r_drb4_typ1,r_drb4_typ2,r_drb5_typ1,r_drb5_typ2,d_drb3_typ1,d_drb3_typ2,d_drb4_typ1,d_drb4_typ2,d_drb5_typ1,d_drb5_typ2 +2110,123,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,DRB3*02:189,DRB3*03:09,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,DRB5*01:93,DRB5*02:02:01 +2111,456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,NNNN,NNNN,DRB4*01:53,DRB4*01:31,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,DRB5*01:102,DRB5*01:103 +2113,789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,NNNN,NNNN,DRB4*01:79,DRB4*01:119,NNNN,NNNN,DRB3*02:189,DRB3*03:09,NNNN,NNNN,NNNN,NNNN diff --git a/scripts/pyard-reduce-csv b/scripts/pyard-reduce-csv index b432659..6558fda 100755 --- a/scripts/pyard-reduce-csv +++ b/scripts/pyard-reduce-csv @@ -209,15 +209,21 @@ def reduce_locus_columns(df, ard_config, locus_column_mapping, verbose): # New columns DRBX_1 and DRBX_2 are created if ard_config.get("map_drb345_to_drbx"): drbx_loci = ["DRB3", "DRB4", "DRB5"] - drbx_columns = [ - col_name for col_name in df.columns if col_name.split("_")[1] in drbx_loci - ] - if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2 - locus_in_allele_name = ard_config["keep_locus_in_allele_name"] - df_drbx = df[drbx_columns].apply( - create_drbx, axis=1, args=(locus_in_allele_name,) - ) - df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx) + for subject in ard_config["locus_column_mapping"].keys(): + subject_loci = ard_config["locus_column_mapping"][subject] + subject_drbs = [] + for locus in ard_config["locus_column_mapping"][subject].keys(): + if locus.upper() in drbx_loci: + subject_drbs.extend(subject_loci[locus]) + + # If all the DRBs are there + # ['DRB3_1', 'DRB3_2', 'DRB4_1', 'DRB4_2', 'DRB5_1', 'DRB5_2'] + if len(subject_drbs) == 6: + locus_in_allele_name = ard_config["keep_locus_in_allele_name"] + df_drbx = df[subject_drbs].apply( + create_drbx, axis=1, args=(locus_in_allele_name,) + ) + df[f"{subject}_DRBX_1"], df[f"{subject}_DRBX_2"] = zip(*df_drbx) if ard_config.get("generate_glstring"): for subject in locus_column_mapping: