NCBI C++ ToolKit
snp_bins.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*  $Id: snp_bins.cpp 66135 2015-02-05 14:26:08Z rudnev $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Melvin Quintos, Dmitry Rudnev
00027  *
00028  * File Description:
00029  *  Implements the functions in snp_bins.hpp
00030  */
00031 
00032 #include <ncbi_pch.hpp>
00033 
00034 #include <objmgr/annot_ci.hpp>
00035 #include <objmgr/feat_ci.hpp>
00036 #include <objmgr/graph_ci.hpp>
00037 #include <objmgr/align_ci.hpp>
00038 #include <objmgr/table_field.hpp>
00039 
00040 #include <objects/seqalign/Seq_align_set.hpp>
00041 #include <objects/seqalign/Spliced_seg.hpp>
00042 #include <objects/general/Object_id.hpp>
00043 #include <objects/seqres/Byte_graph.hpp>
00044 
00045 #include <objtools/snputil/snp_bins.hpp>
00046 
00047 #include <objects/seq/Annot_descr.hpp>
00048 #include <objects/seq/Annotdesc.hpp>
00049 
00050 #include <util/checksum.hpp>
00051 
00052 #include <cmath>
00053 
00054 #include <objtools/snputil/snp_bins.hpp>
00055 #include <objtools/snputil/snp_utils.hpp>
00056 
00057 BEGIN_NCBI_SCOPE
00058 USING_SCOPE(objects);
00059 
00060 
00061 string NSnpBins::SourceAsString(TSource Source)
00062 {
00063     switch(Source)
00064     {
00065         case eSource_dbGAP:
00066             return "dbGaP";
00067         case eSource_NHGRI_GWAS:
00068             return "NHGRI GWAS catalog";
00069         case eSource_NHLBI_GRASP:
00070             return "NHLBI GRASP";
00071     }
00072     return "dbGaP";
00073 }
00074 
00075 void NSnpBins::ReadAnnotDesc(const CSeq_annot_Handle& handle, string& title, string& comment)
00076 {
00077     if (handle.Seq_annot_CanGetDesc()) {
00078         // Extract the Annotations properties
00079         ITERATE( CAnnot_descr::Tdata, it, handle.Seq_annot_GetDesc().Get()) {
00080             const CAnnotdesc &desc = **it;
00081 
00082             if (desc.IsComment()) {
00083                 comment = desc.GetComment();
00084             }
00085             else if (desc.IsTitle()) {
00086                 title = desc.GetTitle();
00087             }
00088         }
00089     }
00090 }
00091 
00092 
00093 // get a selector for a bin given a NA track accession with some selector parameters
00094 void NSnpBins::GetBinSelector(const string& sTrackAccession,
00095     bool isAdaptive,
00096     int depth,
00097     SAnnotSelector& sel)
00098 {
00099     sel.SetOverlapTotalRange().SetResolveAll();
00100     sel.SetAnnotType(CSeq_annot::TData::e_Seq_table);
00101     sel.IncludeNamedAnnotAccession(sTrackAccession);
00102     sel.AddNamedAnnots(sTrackAccession);
00103     // copied from  CSeqUtils::SetResolveDepth()
00104     if(isAdaptive) {
00105         sel.SetAdaptiveDepth(true);
00106         sel.SetExactDepth(false);
00107         //!!: watch out
00108         // Maybe there is bug inside selector, we have to call SetResolveAll() even
00109         // for cases where we only want to resolve up to a given depth.
00110         sel.SetResolveAll();
00111         if (depth >=0) sel.SetResolveDepth(depth);
00112     } else if (depth >= 0){
00113         sel.SetResolveDepth(depth);
00114         sel.SetExactDepth(true);
00115         sel.SetAdaptiveDepth(false);
00116     }
00117 }
00118 
00119 // get an annotation handle that is needed to load a bin from an existing selector and loc and bioseq handle
00120 // returns false if a handle cannot be obtained
00121 bool NSnpBins::GetBinHandle(CScope& scope,
00122     const SAnnotSelector& sel,
00123     const CSeq_loc &loc,
00124     CSeq_annot_Handle& handle)
00125 {
00126     CAnnot_CI iter(scope, loc, sel);
00127     if(iter.size() != 1) {
00128         return false;
00129     }
00130     handle = *iter;
00131     return true;
00132 }
00133 
00134 
00135 
00136 
00137 // choose a more significant entry of the two offered
00138 // returns 1 of entry1 is more significant or 2 if entry2 is more
00139 int NSnpBins::ChooseSignificant(const SBinEntry* entry1, const SBinEntry* entry2, TBinType type)
00140 {
00141     // significance is determined using different metrics depending on the bin type
00142     // for eCLIN, the most significant is pathogenic, then probably pathogenic, then everything else
00143     // for all other bins, the significance is determined by the largest pvalue
00144     if(type == eCLIN) {
00145         return (entry1->ClinSigID == CPhenotype::eClinical_significance_pathogenic ||
00146                                 (entry1->ClinSigID == CPhenotype::eClinical_significance_probable_pathogenic && entry2->ClinSigID != CPhenotype::eClinical_significance_pathogenic))
00147                             ?   1
00148                             :   2;
00149     } else {
00150         return (entry1->pvalue > entry2->pvalue)
00151                                 ? 1
00152                                 : 2;
00153     }
00154 }
00155 
00156 
00157 CRef<NSnpBins::SBin> NSnpBins::GetBin(const objects::CSeq_annot_Handle& annot,
00158                                   TSeqRange range)
00159 {
00160     const CTableFieldHandle<int>      col_type("trackType");
00161     int     pos_start, pos_end;
00162     int type;
00163     string title, comment;
00164     CRef<SBin>  res(new SBin);
00165     FindPosIndexRange(annot, (int)range.GetFrom(), (int)range.GetTo(), pos_start, pos_end);
00166     ReadAnnotDesc(annot, title, comment);
00167     if (!col_type.TryGet(annot, 0, type)) {
00168         type = NSnpBins::eGAP;
00169     }
00170     res->count = 0;
00171     res->range = range;
00172     res->title = title;
00173     res->type = type;
00174 
00175     for(int row = pos_start; row < pos_end; ++row ) {
00176         CRef<NSnpBins::SBinEntry> BinEntry(GetEntry(annot, row));
00177         if(res->m_SigEntry.Empty()) {
00178             res->m_SigEntry = BinEntry;
00179         } else {
00180             if(ChooseSignificant(res->m_SigEntry, BinEntry, type) == 2) {
00181                 res->m_SigEntry = BinEntry;
00182             }
00183         }
00184         res->m_EntryList.push_back(BinEntry);
00185         res->count++;
00186     }
00187     return res;
00188 }
00189 
00190 CRef<NSnpBins::SBinEntry> NSnpBins::GetEntry(const objects::CSeq_annot_Handle& annot,
00191                                              int row)
00192 {
00193     const CTableFieldHandle<int>      col_pos("pos");
00194     const CTableFieldHandle<int>      col_pos_end("pos_end");
00195     const CTableFieldHandle<double>   col_val("pvalue");
00196     const CTableFieldHandle<double>   col_val_synonym("Pvalue");
00197     const CTableFieldHandle<string>   col_trait("trait");
00198     const CTableFieldHandle<string>   col_pmids("pmids");
00199     const CTableFieldHandle<string>   col_rgenes("reportedGenes");
00200     const CTableFieldHandle<string>   col_mgenes("mappedGenes");
00201     const CTableFieldHandle<int>      col_snpid("snpId");
00202     const CTableFieldHandle<string>   col_sub_type("trackSubType");
00203     const CTableFieldHandle<int>      col_clinsigid("clinSigID");
00204     const CTableFieldHandle<string>   col_hgvs("HGVS");
00205     const CTableFieldHandle<string>   col_dbgaptext("dbgaptext");
00206     const CTableFieldHandle<string>   col_context("context");
00207     const CTableFieldHandle<int>      col_source("source");
00208     const CTableFieldHandle<string>   col_population("population");
00209     const CTableFieldHandle<int>      col_geneId("geneId");
00210     const CTableFieldHandle<string>   col_geneName("geneName");
00211 
00212     string  trackSubType;
00213     int pos, pos_end;
00214     int snpid, ClinSigID;
00215     double  pvalue;
00216     string  trait, pmids, rgenes, mgenes;
00217     string  title, comment, population;
00218     string  sHGVS;
00219     int     source, geneId;
00220     string  dbgaptext, geneName;
00221     string  context;
00222 
00223     CRef<NSnpBins::SBinEntry> entry;
00224     if(col_pos.TryGet(annot, row, pos)) {
00225         entry.Reset(new NSnpBins::SBinEntry());
00226         entry->pos = (TSeqPos)pos;
00227         entry->pos_end  = col_pos_end.TryGet(annot, row, pos_end) ? (TSeqPos)pos_end : kInvalidSeqPos;
00228         entry->trackSubType = col_sub_type.TryGet(annot, row, trackSubType) ? trackSubType : "";
00229         entry->snpid  = col_snpid.TryGet(annot, row, snpid) ? (unsigned int)snpid : 0;
00230         entry->pvalue = (col_val.TryGet(annot, row, pvalue) || col_val_synonym.TryGet(annot, row, pvalue)) ? -log10(pvalue) : 0;
00231         entry->trait = col_trait.TryGet(annot, row, trait) ? trait : "";
00232         entry->pmids  = col_pmids.TryGet(annot, row, pmids) ? pmids : "";
00233         entry->genes_reported = col_rgenes.TryGet(annot, row, rgenes) ? rgenes : "";
00234         entry->genes_mapped = col_mgenes.TryGet(annot, row, mgenes) ? mgenes : "";
00235         entry->ClinSigID = col_clinsigid.TryGet(annot, row, ClinSigID) ? ClinSigID : -1;
00236         entry->sHGVS = col_hgvs.TryGet(annot, row, sHGVS) ? sHGVS : "";
00237         entry->dbgaptext = col_dbgaptext.TryGet(annot, row, dbgaptext) ? dbgaptext : "";
00238         entry->context = col_context.TryGet(annot, row, context) ? context : "";
00239         entry->source = col_source.TryGet(annot, row, source) ? source : -1;
00240         entry->population = col_population.TryGet(annot, row, population) ? population : "";
00241         entry->geneName = col_geneName.TryGet(annot, row, geneName) ? geneName : "";
00242         entry->geneId = col_geneId.TryGet(annot, row, geneId) ? geneId : -1;
00243     }
00244     return entry;
00245 }
00246 
00247 void NSnpBins::CGeneMap::x_Init(const string& sSrc)
00248 {
00249     m_GeneMap.clear();
00250     list<string> GeneSymIDPairsList;
00251 
00252     NStr::Split(sSrc, ":", GeneSymIDPairsList);
00253 
00254     ITERATE(list<string>, iGeneSymIDPairsList, GeneSymIDPairsList) {
00255         list<string> GeneSymIDPair;
00256 
00257         NStr::Split(*iGeneSymIDPairsList, "^", GeneSymIDPair);
00258 
00259         m_GeneMap[GeneSymIDPair.front()] = GeneSymIDPair.size() == 2 ? GeneSymIDPair.back() : string();
00260     }
00261 }
00262 
00263 string NSnpBins::CGeneMap::AsString() const
00264 {
00265     string sRes;
00266 
00267     ITERATE(TGeneMap, iGeneMap, m_GeneMap) {
00268         sRes += (sRes.empty() ? "" : ":") + iGeneMap->first + "^" + iGeneMap->second;
00269     }
00270 
00271     return sRes;
00272 }
00273 
00274 void NSnpBins::FindPosIndexRange(const CSeq_annot_Handle& annot,
00275                          int pos_value_from, int pos_value_to,
00276                           int& pos_index_begin, int& pos_index_end)
00277 {
00278     size_t rows = annot.GetSeq_tableNumRows();
00279     const CTableFieldHandle<int> col_pos("pos");
00280     const CTableFieldHandle<string>   col_sub_type("trackSubType");
00281 
00282     pos_index_begin = -1;
00283     pos_index_end   = rows-1;
00284 
00285     string  trackSubType;
00286     col_sub_type.TryGet(annot, 0, trackSubType);
00287 
00288     const CTableFieldHandle<int>  col_pos_end(NSnpBins::isGeneMarker(trackSubType) ? "pos_end" : "pos");
00289 
00290 
00291     // Find 'pos_value_from'
00292     int lower_pos_index_bound = 0;
00293     int upper_pos_index_bound = rows-1;
00294     int pos_index_k = 0;
00295     int pos_value_k = 0;
00296     do {
00297         pos_index_k = (lower_pos_index_bound + upper_pos_index_bound)/2;
00298         col_pos_end.TryGet(annot, pos_index_k, pos_value_k);
00299         if(pos_value_k < pos_value_from) {
00300             pos_index_begin = pos_index_k;
00301             lower_pos_index_bound = pos_index_k+1;
00302         } else {
00303             upper_pos_index_bound = pos_index_k-1;
00304         }
00305     } while (pos_value_k != pos_value_from && lower_pos_index_bound <= upper_pos_index_bound);
00306 
00307     // position start to be inclusive (catch boundary condition)
00308     pos_index_begin = (pos_value_from == pos_value_k ? pos_index_k : pos_index_begin+1);
00309 
00310     // slide the start down for cases when there are several entries with the same position
00311     int SlidingBegin(pos_index_begin-1);
00312     while(SlidingBegin >= 0) {
00313         col_pos_end.TryGet(annot, SlidingBegin, pos_value_k);
00314         if(pos_value_k < pos_value_from)
00315             break;
00316         pos_index_begin = SlidingBegin;
00317         --SlidingBegin;
00318     }
00319 
00320     // find the 'pos_value_to' value
00321     lower_pos_index_bound=0;
00322     upper_pos_index_bound=rows-1;
00323     pos_value_k = 0;
00324     do {
00325         pos_index_k = (lower_pos_index_bound+upper_pos_index_bound)/2;
00326         col_pos.TryGet(annot, pos_index_k, pos_value_k);
00327         if (pos_value_k < pos_value_to) {
00328             lower_pos_index_bound = pos_index_k+1;
00329         }
00330         else {
00331             pos_index_end = pos_index_k;
00332             upper_pos_index_bound = pos_index_k-1;
00333         }
00334     } while (pos_value_k != pos_value_to && lower_pos_index_bound <= upper_pos_index_bound);
00335 
00336     // increase end to include in range up until the latest entry with "pos".
00337     size_t SlidingEnd(pos_index_end);
00338     while(SlidingEnd < rows) {
00339         col_pos.TryGet(annot, SlidingEnd, pos_value_k);
00340         if(pos_value_k > pos_value_to)
00341             break;
00342         pos_index_end = SlidingEnd+1;
00343         ++SlidingEnd;
00344     }
00345 }
00346 
00347 
00348 
00349 
00350 END_NCBI_SCOPE
00351 
Modified on Mon May 25 11:35:40 2015 by modify_doxy.py rev. 426318