NCBI C++ ToolKit
cn3d_pssm.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*  $Id: cn3d_pssm.cpp 44550 2010-01-21 20:28:38Z thiessen $
00002 * ===========================================================================
00003 *
00004 *                            PUBLIC DOMAIN NOTICE
00005 *               National Center for Biotechnology Information
00006 *
00007 *  This software/database is a "United States Government Work" under the
00008 *  terms of the United States Copyright Act.  It was written as part of
00009 *  the author's official duties as a United States Government employee and
00010 *  thus cannot be copyrighted.  This software/database is freely available
00011 *  to the public for use. The National Library of Medicine and the U.S.
00012 *  Government have not placed any restriction on its use or reproduction.
00013 *
00014 *  Although all reasonable efforts have been taken to ensure the accuracy
00015 *  and reliability of the software and data, the NLM and the U.S.
00016 *  Government do not and cannot warrant the performance or results that
00017 *  may be obtained by using this software or data. The NLM and the U.S.
00018 *  Government disclaim all warranties, express or implied, including
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.
00021 *
00022 *  Please cite the author in any work or product based on this material.
00023 *
00024 * ===========================================================================
00025 *
00026 * Authors:  Paul Thiessen
00027 *
00028 * File Description:
00029 *      new C++ PSSM construction
00030 *
00031 * ===========================================================================
00032 */
00033 
00034 #include <ncbi_pch.hpp>
00035 #include <corelib/ncbistd.hpp>
00036 #include <corelib/ncbistre.hpp>
00037 #include <serial/serial.hpp>
00038 #include <serial/objostrasn.hpp>
00039 
00040 #include <objects/scoremat/scoremat__.hpp>
00041 
00042 #include <algo/structure/cd_utils/cuCdCore.hpp>
00043 #include <algo/structure/cd_utils/cuPssmMaker.hpp>
00044 
00045 #include "remove_header_conflicts.hpp"
00046 
00047 #include "cn3d_pssm.hpp"
00048 #include "block_multiple_alignment.hpp"
00049 #include "sequence_set.hpp"
00050 #include "cn3d_tools.hpp"
00051 
00052 USING_NCBI_SCOPE;
00053 USING_SCOPE(objects);
00054 USING_SCOPE(blast);
00055 
00056 
00057 BEGIN_SCOPE(Cn3D)
00058 
00059 //#define DEBUG_PSSM 1 // for testing/debugging PSSM data
00060 
// Convenience wrapper used in this file: throws a CException with error code eUnknown via
// NCBI_THROW, using `stream` as the message argument.
#define PTHROW(stream) NCBI_THROW(CException, eUnknown, stream)
00062 
00063 PSSMWrapper::PSSMWrapper(const BlockMultipleAlignment *bma) : multiple(bma)
00064 {
00065 #ifdef DEBUG_PSSM
00066     {{
00067         CNcbiOfstream ofs("pssm.txt", IOS_BASE::out);
00068     }}
00069 #endif
00070 
00071     try {
00072         TRACEMSG("Creating PSSM...");
00073 
00074         // construct a "fake" CD to pass to PssmMaker
00075         cd_utils::CCdCore c;
00076         c.SetName("fake");
00077 
00078         // construct Seq-entry from sequences in current alignment
00079         CRef < CSeq_entry > seq(new CSeq_entry);
00080         seq->SetSeq().Assign(multiple->GetMaster()->bioseqASN.GetObject());
00081         c.SetSequences().SetSet().SetSeq_set().push_back(seq);
00082 
00083         // construct Seq-annot from rows in the alignment
00084         c.SetSeqannot().push_back(CRef<CSeq_annot>(new CSeq_annot));
00085         BlockMultipleAlignment::UngappedAlignedBlockList blocks;
00086         bma->GetUngappedAlignedBlocks(&blocks);
00087 
00088         // fill out Seq-entry and Seq-annot based on BMA (row order is irrelevant here)
00089         for (unsigned int i=((bma->NRows() > 1) ? 1 : 0); i<bma->NRows(); ++i) {
00090             seq.Reset(new CSeq_entry);
00091             seq->SetSeq().Assign(multiple->GetSequenceOfRow(i)->bioseqASN.GetObject());
00092             c.SetSequences().SetSet().SetSeq_set().push_back(seq);
00093             CRef < CSeq_align > seqAlign(CreatePairwiseSeqAlignFromMultipleRow(bma, blocks, i));
00094             c.SetSeqannot().front()->SetData().SetAlign().push_back(seqAlign);
00095         }
00096 
00097         // use PssmMaker to create PSSM using consensus
00098         cd_utils::PssmMaker pm(&c, true, true);
00099         cd_utils::PssmMakerOptions options;     // comes with defaults
00100         options.requestFrequencyRatios = true;  // necessary for psi-blast
00101 //        options.scalingFactor = 100;          // do *NOT* use SF other than 1 for psi-blast
00102         pm.setOptions(options);
00103         pssm = pm.make();
00104 
00105         // blast functions require a master (query) sequence to be present; give it a recognizable id
00106         if (!pssm->GetPssm().IsSetQuery() || !pssm->GetPssm().GetQuery().IsSeq())
00107             PTHROW("PssmWithParameters from cd_utils::PssmMaker() doesn't contain the master/query sequence");
00108         CRef < CSeq_id > id(new CSeq_id);
00109         id->SetLocal().SetStr("consensus");
00110         pssm->SetPssm().SetQuery().SetSeq().SetId().push_front(id);
00111 
00112         // for efficient score lookup
00113         UnpackMatrix(pm);
00114 
00115 #ifdef DEBUG_PSSM
00116         CNcbiOfstream ofs("pssm.txt", IOS_BASE::out | IOS_BASE::app);
00117         if (ofs) {
00118             CObjectOStreamAsn oosa(ofs, false);
00119             oosa << *pssm;
00120 
00121 /*
00122             if (pssm->GetPssm().IsSetIntermediateData() && pssm->GetPssm().GetIntermediateData().IsSetResFreqsPerPos()) {
00123                 vector < int >
00124                     freqs(pssm->GetPssm().GetIntermediateData().GetResFreqsPerPos().size()),
00125                     nNonGap(pssm->GetPssm().GetNumColumns(), 0);
00126                 unsigned int i;
00127                 CPssmIntermediateData::TResFreqsPerPos::const_iterator
00128                     l = pssm->GetPssm().GetIntermediateData().GetResFreqsPerPos().begin();
00129                 for (i=0; i<pssm->GetPssm().GetIntermediateData().GetResFreqsPerPos().size(); ++i, ++l)
00130                     freqs[i] = *l;
00131                 int freq, n;
00132                 ofs << "observed frequencies:\n";
00133                 for (unsigned int c=0; c<pssm->GetPssm().GetNumColumns(); ++c) {
00134                     ofs << "column " << (c+1) << ": ";
00135                     n = 0;
00136                     for (unsigned int r=0; r<pssm->GetPssm().GetNumRows(); ++r) {
00137                         if (pssm->GetPssm().GetByRow())
00138                             freq = freqs[r * pssm->GetPssm().GetNumColumns() + c];
00139                         else
00140                             freq = freqs[c * pssm->GetPssm().GetNumRows() + r];
00141                         if (freq > 0) {
00142                             ofs << LookupCharacterFromNCBIStdaaNumber(r) << '(' << freq << ") ";
00143                             n += freq;
00144                             if (r != 0)
00145                                 nNonGap[c] += freq;
00146                         }
00147                     }
00148                     ofs << "total: " << n << " non-gap: " << nNonGap[c] << '\n';
00149                 }
00150 
00151                 if (pssm->GetPssm().IsSetIntermediateData() && pssm->GetPssm().GetIntermediateData().IsSetWeightedResFreqsPerPos()) {
00152                     vector < double > wfreqs(pssm->GetPssm().GetIntermediateData().GetWeightedResFreqsPerPos().size());
00153                     CPssmIntermediateData::TWeightedResFreqsPerPos::const_iterator
00154                         m = pssm->GetPssm().GetIntermediateData().GetWeightedResFreqsPerPos().begin();
00155                     for (i=0; i<pssm->GetPssm().GetIntermediateData().GetWeightedResFreqsPerPos().size(); ++i, ++m)
00156                         wfreqs[i] = *m;
00157                     double wfreq, s;
00158                     ofs << "weighted frequencies:\n";
00159                     for (unsigned int c=0; c<pssm->GetPssm().GetNumColumns(); ++c) {
00160                         ofs << "column " << (c+1) << ": ";
00161                         s = 0.0;
00162                         for (unsigned int r=0; r<pssm->GetPssm().GetNumRows(); ++r) {
00163                             if (pssm->GetPssm().GetByRow())
00164                                 wfreq = wfreqs[r * pssm->GetPssm().GetNumColumns() + c];
00165                             else
00166                                 wfreq = wfreqs[c * pssm->GetPssm().GetNumRows() + r];
00167                             if (wfreq != 0.0) {
00168                                 ofs << LookupCharacterFromNCBIStdaaNumber(r) << '(' << wfreq << ") ";
00169                                 s += wfreq;
00170                             }
00171                         }
00172                         ofs << "sum: " << s << '\n';
00173                     }
00174                 }
00175 
00176                 if (pssm->GetPssm().IsSetIntermediateData() && pssm->GetPssm().GetIntermediateData().IsSetFreqRatios()) {
00177                     vector < double > ratios(pssm->GetPssm().GetIntermediateData().GetFreqRatios().size());
00178                     CPssmIntermediateData::TFreqRatios::const_iterator
00179                         n = pssm->GetPssm().GetIntermediateData().GetFreqRatios().begin();
00180                     for (i=0; i<pssm->GetPssm().GetIntermediateData().GetFreqRatios().size(); ++i, ++n)
00181                         ratios[i] = *n;
00182                     double ratio, s;
00183                     ofs << "frequency ratios:\n";
00184                     for (unsigned int c=0; c<pssm->GetPssm().GetNumColumns(); ++c) {
00185                         ofs << "column " << (c+1) << ": ";
00186                         s = 0.0;
00187                         for (unsigned int r=0; r<pssm->GetPssm().GetNumRows(); ++r) {
00188                             if (pssm->GetPssm().GetByRow())
00189                                 ratio = ratios[r * pssm->GetPssm().GetNumColumns() + c];
00190                             else
00191                                 ratio = ratios[c * pssm->GetPssm().GetNumRows() + r];
00192                             if (ratio != 0.0) {
00193                                 ofs << LookupCharacterFromNCBIStdaaNumber(r) << '(' << ratio << ") ";
00194                                 s += ratio;
00195                             }
00196                         }
00197                         ofs << "sum: " << s << '\n';
00198                     }
00199                 }
00200             }
00201 */
00202         }
00203 #endif
00204 
00205     } catch (exception& e) {
00206         ERRORMSG("PSSMWrapper::PSSMWrapper() failed with exception: " << e.what());
00207     } catch (...) {
00208         ERRORMSG("PSSMWrapper::PSSMWrapper() failed with unknown exception");
00209     }
00210 }
00211 
00212 void PSSMWrapper::UnpackMatrix(ncbi::cd_utils::PssmMaker& pm)
00213 {
00214     if (!pssm->GetPssm().IsSetFinalData())
00215         PTHROW("UnpackMatrix() - pssm must have finalData");
00216     unsigned int nScores = pssm->GetPssm().GetNumRows() * pssm->GetPssm().GetNumColumns();
00217     if (pssm->GetPssm().GetNumRows() != 28 || pssm->GetPssm().GetFinalData().GetScores().size() != nScores)
00218         PTHROW("UnpackMatrix() - bad matrix size");
00219 
00220     scalingFactor = pssm->GetPssm().GetFinalData().GetScalingFactor();
00221 
00222     // allocate matrix
00223     unsigned int i;
00224     scaledMatrix.resize(pssm->GetPssm().GetNumColumns());
00225     for (i=0; (int)i<pssm->GetPssm().GetNumColumns(); ++i)
00226         scaledMatrix[i].resize(28);
00227 
00228     // convert matrix
00229     unsigned int r = 0, c = 0;
00230     CPssmFinalData::TScores::const_iterator s = pssm->GetPssm().GetFinalData().GetScores().begin();
00231     for (i=0; i<nScores; ++i, ++s) {
00232 
00233         scaledMatrix[c][r] = *s;
00234 
00235         // adjust for matrix layout in pssm
00236         if (pssm->GetPssm().GetByRow()) {
00237             ++c;
00238             if ((int)c == pssm->GetPssm().GetNumColumns()) {
00239                 ++r;
00240                 c = 0;
00241             }
00242         } else {
00243             ++r;
00244             if ((int)r == pssm->GetPssm().GetNumRows()) {
00245                 ++c;
00246                 r = 0;
00247             }
00248         }
00249     }
00250 
00251     // map multiple's master <-> consensus position
00252     if ((int)pm.getConsensus().size() != pssm->GetPssm().GetNumColumns())
00253         PTHROW("Consensus sequence does not match PSSM size");
00254     TRACEMSG("master length: " << multiple->GetMaster()->Length() << ", consensus length: " << pm.getConsensus().size());
00255     cd_utils::BlockModelPair bmp(pm.getGuideAlignment());   // consensus is dependent
00256     consensus2master.resize(pm.getConsensus().size());
00257     for (i=0; i<pm.getConsensus().size(); ++i)
00258         consensus2master[i] = bmp.mapToMaster(i);
00259     bmp.reverse();  // so that master is consensus, dependent is multiple's master
00260     master2consensus.resize(multiple->GetMaster()->Length());
00261     for (i=0; i<multiple->GetMaster()->Length(); ++i)
00262         master2consensus[i] = bmp.mapToMaster(i);
00263 }
00264 
00265 void PSSMWrapper::OutputPSSM(ncbi::CNcbiOstream& os, const string& title) const
00266 {
00267     // create a copy of the pssm, massaged a bit so that it'll work correctly with psi-blast, rps-blast
00268     CPssmWithParameters copy;
00269     copy.Assign(*pssm);
00270     if (!copy.GetPssm().IsSetQuery() || !copy.GetPssm().GetQuery().IsSeq()) {
00271         ERRORMSG("PssmWithParameters from cd_utils::PssmMaker() doesn't contain the master/query sequence");
00272         return;
00273     }
00274 
00275     CBioseq::TId keep;
00276     CBioseq::TId::iterator i, ie = copy.SetPssm().SetQuery().SetSeq().SetId().end();
00277     for (i=copy.SetPssm().SetQuery().SetSeq().SetId().begin(); i!=ie; ++i) {
00278         if ((*i)->IsLocal() && (*i)->GetLocal().IsStr())
00279             (*i)->SetLocal().SetStr(title);
00280         if (!(*i)->IsGeneral() || (*i)->GetGeneral().GetDb() != "Cdd")
00281             keep.push_back(*i);
00282     }
00283     copy.SetPssm().SetQuery().SetSeq().SetId() = keep;
00284 
00285     CSeq_descr::Tdata::iterator d, de = copy.SetPssm().SetQuery().SetSeq().SetDescr().Set().end();
00286     for (d=copy.SetPssm().SetQuery().SetSeq().SetDescr().Set().begin(); d!=de; ++d) {
00287         if ((*d)->IsTitle()) {
00288             (*d)->SetTitle(title);
00289             break;
00290         }
00291     }
00292     if (d == de) {
00293         CRef < CSeqdesc > descr(new CSeqdesc);
00294         descr->SetTitle(title);
00295         copy.SetPssm().SetQuery().SetSeq().SetDescr().Set().push_front(descr);
00296     }
00297 
00298     // do not put scores in output pssm, only freq ratios
00299     copy.SetPssm().ResetFinalData();
00300     if (!copy.GetPssm().IsSetIntermediateData() || !copy.GetPssm().GetIntermediateData().IsSetFreqRatios())
00301         ERRORMSG("PSSM is missing frequency ratios");
00302 
00303     CObjectOStreamAsn osa(os, false);
00304     osa << copy;
00305 }
00306 
// Converts Num to int by pushing it 0.5 further away from zero and then truncating,
// so that e.g. 0.5 becomes 1 and -0.5 becomes -1.
static inline int Round(double Num)
{
  return (Num >= 0) ? static_cast<int>(Num + 0.5) : static_cast<int>(Num - 0.5);
}
00314 
00315 int PSSMWrapper::GetPSSMScore(unsigned char ncbistdaa, unsigned int realMasterIndex) const
00316 {
00317     if (ncbistdaa >= 28 || realMasterIndex > multiple->GetMaster()->Length()) {
00318         ERRORMSG("PSSMWrapper::GetPSSMScore() - invalid parameters");
00319         return kMin_Int;
00320     }
00321 
00322     // maps to a position in the consensus/pssm
00323     int consensusIndex = master2consensus[realMasterIndex];
00324     if (consensusIndex >= 0) {
00325         double scaledScore;
00326         switch (ncbistdaa) {
00327             case 2:  // B -> average D/N
00328                 scaledScore = ((double) (scaledMatrix[consensusIndex][4] + scaledMatrix[consensusIndex][13])) / 2;
00329                 break;
00330             case 23: // Z -> average E/Q
00331                 scaledScore = ((double) (scaledMatrix[consensusIndex][5] + scaledMatrix[consensusIndex][15])) / 2;
00332                 break;
00333             case 24: // U -> C
00334                 scaledScore = scaledMatrix[consensusIndex][3];
00335                 break;
00336             case 26: // O -> K
00337                 scaledScore = scaledMatrix[consensusIndex][10];
00338                 break;
00339             case 27: // J -> average I/L
00340                 scaledScore = ((double) (scaledMatrix[consensusIndex][9] + scaledMatrix[consensusIndex][11])) / 2;
00341                 break;
00342             default:
00343                 scaledScore = scaledMatrix[consensusIndex][ncbistdaa];
00344         }
00345         return Round(scaledScore / scalingFactor);
00346     }
00347 
00348     // use simple blosum62 score if outside the consensus/pssm
00349     return GetBLOSUM62Score(LookupCharacterFromNCBIStdaaNumber(ncbistdaa), multiple->GetMaster()->sequenceString[realMasterIndex]);
00350 }
00351 
00352 END_SCOPE(Cn3D)
Modified on Wed Aug 20 19:58:13 2014 by modify_doxy.py rev. 426318