NCBI C++ ToolKit
blastdbcp.cpp
Go to the documentation of this file.
00001 /*  $Id: blastdbcp.cpp 52305 2011-12-15 14:16:31Z fongah2 $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  */
00026 /** @file blastdbcp.cpp
00027  * @author Christiam Camacho
00028  */
00029 
00030 #include <ncbi_pch.hpp>
00031 #include <corelib/ncbiapp.hpp>
00032 #include <algo/blast/blastinput/cmdline_flags.hpp>
00033 #include <objtools/blast/seqdb_writer/build_db.hpp>
00034 
00035 USING_NCBI_SCOPE;
00036 USING_SCOPE(blast);
00037 
00038 
00039 /////////////////////////////////////////////////////////////////////////////
00040 //  BlastdbCopyApplication::
00041 
00042 
00043 class BlastdbCopyApplication : public CNcbiApplication
00044 {
00045 public:
00046     BlastdbCopyApplication();
00047 
00048 private: /* Private Methods */
00049     virtual void Init(void);
00050     virtual int  Run(void);
00051     virtual void Exit(void);
00052 
00053     bool x_ShouldParseSeqIds(const string& dbname, 
00054                              CSeqDB::ESeqType seq_type) const;
00055 
00056     bool x_ShouldCopyPIGs(const string& dbname,
00057                           CSeqDB::ESeqType seq_type) const;
00058 
00059 private: /* Private Data */
00060     bool    m_bCheckOnly;
00061 };
00062 
00063 /////////////////////////////////////////////////////////////////////////////
00064 //  Constructor
00065 
00066 BlastdbCopyApplication::BlastdbCopyApplication()
00067   : m_bCheckOnly(false)
00068 {
00069     CRef<CVersion> version(new CVersion());
00070     version->SetVersionInfo(1, 0);
00071     SetFullVersion(version);
00072 }
00073 
00074 
00075 /////////////////////////////////////////////////////////////////////////////
00076 //  Init: describe the command-line arguments accepted by the application
00077 
00078 
00079 void BlastdbCopyApplication::Init(void)
00080 {
00081     // Create command-line argument descriptions class
00082     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
00083 
00084     // Specify USAGE context
00085     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
00086                               "Performs a (deep) copy of a subset of a BLAST database");
00087 
00088     arg_desc->SetCurrentGroup("BLAST database options");
00089     arg_desc->AddDefaultKey(kArgDb, "dbname", "BLAST database name", 
00090                             CArgDescriptions::eString, "nr");
00091 
00092     arg_desc->AddDefaultKey(kArgDbType, "molecule_type",
00093                             "Molecule type stored in BLAST database",
00094                             CArgDescriptions::eString, "prot");
00095     arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings,
00096                                         "nucl", "prot", "guess"));
00097 
00098     arg_desc->SetCurrentGroup("Configuration options");
00099     arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
00100                              "Title for BLAST database",
00101                              CArgDescriptions::eString);
00102     arg_desc->AddKey(kArgGiList, "input_file", 
00103                      "Text or binary gi file to restrict the BLAST "
00104                      "database provided in -db argument",
00105                      CArgDescriptions::eString);
00106     arg_desc->AddFlag("membership_bits", "Copy the membershi bits", true);
00107 
00108     arg_desc->SetCurrentGroup("Output options");
00109     arg_desc->AddOptionalKey(kArgOutput, "database_name",
00110                              "Name of BLAST database to be created",
00111                              CArgDescriptions::eString);
00112     HideStdArgs(fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
00113     SetupArgDescriptions(arg_desc.release());
00114 }
00115 
00116 class CBlastDbBioseqSource : public IBioseqSource
00117 {
00118 public:
00119     CBlastDbBioseqSource(CRef<CSeqDBExpert> blastdb,
00120                          CRef<CSeqDBGiList> gilist,
00121                          bool copy_membership_bits = false)
00122     {
00123         CStopWatch total_timer, bioseq_timer, memb_timer;
00124         total_timer.Start();
00125         for (int i = 0; i < gilist->GetNumGis(); i++) {
00126             const CSeqDBGiList::SGiOid& elem = gilist->GetGiOid(i);
00127             int oid = 0;
00128             if ( !blastdb->GiToOid(elem.gi, oid)) {
00129                 // not found on source BLASTDB, skip
00130                 continue;
00131             }
00132             if (m_Oids2Copy.insert(oid).second == false) {
00133                 // don't add the same OID twice to avoid duplicates
00134                 continue;
00135             }
00136             bioseq_timer.Start();
00137             CConstRef<CBioseq> bs(&*blastdb->GetBioseq(oid));
00138             m_Bioseqs.push_back(bs);
00139             bioseq_timer.Stop();
00140 
00141             if (copy_membership_bits == false)
00142                 continue;
00143 
00144             memb_timer.Start();
00145             CRef<CBlast_def_line_set> hdr = CSeqDB::ExtractBlastDefline(*bs);
00146             ITERATE(CBlast_def_line_set::Tdata, itr, hdr->Get()) {
00147                 CRef<CBlast_def_line> bdl = *itr;
00148                 if (bdl->CanGetMemberships() && 
00149                     !bdl->GetMemberships().empty()) {
00150                     int memb_bits = bdl->GetMemberships().front();
00151                     if (memb_bits == 0) {
00152                         continue;
00153                     }
00154                     const string id = bdl->GetSeqid().front()->AsFastaString();
00155                     m_MembershipBits[memb_bits].push_back(id);
00156                 }
00157             }
00158             memb_timer.Stop();
00159         }
00160         total_timer.Stop();
00161         ERR_POST(Info << "Will extract " << m_Bioseqs.size()
00162                       << " sequences from the source database");
00163         ERR_POST(Info << "Processed all input data in " << total_timer.AsSmartString());
00164         ERR_POST(Info << "Processed bioseqs in " << bioseq_timer.AsSmartString());
00165         ERR_POST(Info << "Processed membership bits in " << memb_timer.AsSmartString());
00166     }
00167 
00168     const TLinkoutMap GetMembershipBits() const {
00169         return m_MembershipBits;
00170     }
00171 
00172     virtual CConstRef<CBioseq> GetNext() 
00173     {
00174         if (m_Bioseqs.empty()) {
00175             return CConstRef<CBioseq>(0);
00176         }
00177         CConstRef<CBioseq> retval = m_Bioseqs.back();
00178         m_Bioseqs.pop_back();
00179         return retval;
00180     }
00181 private:
00182     typedef list< CConstRef<CBioseq> > TBioseqs;
00183     TBioseqs m_Bioseqs;
00184     set<int> m_Oids2Copy;
00185     TLinkoutMap m_MembershipBits;
00186 };
00187 
00188 bool BlastdbCopyApplication::x_ShouldParseSeqIds(const string& dbname,
00189                                                  CSeqDB::ESeqType seq_type) const
00190 {
00191     vector<string> file_paths;
00192     CSeqDB::FindVolumePaths(dbname, seq_type, file_paths);
00193     const char type = (seq_type == CSeqDB::eProtein ? 'p' : 'n');
00194     bool retval = false;
00195     const char* isam_extensions[] = { "si", "sd", "ni", "nd", NULL };
00196 
00197     ITERATE(vector<string>, f, file_paths) {
00198         for (int i = 0; isam_extensions[i] != NULL; i++) {
00199             CNcbiOstrstream oss;
00200             oss << *f << "." << type << isam_extensions[i];
00201             const string fname = CNcbiOstrstreamToString(oss);
00202             CFile file(fname);
00203             if (file.Exists() && file.GetLength() > 0) {
00204                 retval = true;
00205                 break;
00206             }
00207         }
00208         if (retval) break;
00209     }
00210     return retval;
00211 }
00212 
00213 bool BlastdbCopyApplication::x_ShouldCopyPIGs(const string& dbname,
00214                                               CSeqDB::ESeqType seq_type) const
00215 {
00216     if(CSeqDB::eProtein != seq_type)
00217         return false;
00218 
00219     vector<string> file_paths;
00220     CSeqDB::FindVolumePaths(dbname, CSeqDB::eProtein, file_paths);
00221     ITERATE(vector<string>, f, file_paths) {
00222         CNcbiOstrstream oss;
00223         oss << *f << "." << "ppd";
00224         const string fname = CNcbiOstrstreamToString(oss);
00225         CFile file(fname);
00226         if (file.Exists() && file.GetLength() > 0)
00227                 return true;
00228     }
00229      return false;
00230 }
00231 
00232 
00233 /////////////////////////////////////////////////////////////////////////////
00234 //  Run the program
00235 int BlastdbCopyApplication::Run(void)
00236 {
00237     int retval = 0;
00238     const CArgs& args = GetArgs();
00239 
00240     // Setup Logging
00241     if (args["logfile"]) {
00242         SetDiagPostLevel(eDiag_Info); 
00243         SetDiagPostFlag(eDPF_All); 
00244         time_t now = time(0);
00245         LOG_POST( Info << string(72,'-') << "\n" << "NEW LOG - " << ctime(&now) );
00246     }
00247 
00248     CSeqDB::ESeqType seq_type = CSeqDB::eUnknown;
00249     try {{
00250 
00251         seq_type = ParseMoleculeTypeString(args[kArgDbType].AsString());
00252         CRef<CSeqDBGiList> gilist(new CSeqDBFileGiList(args[kArgGiList].AsString()));
00253         CRef<CSeqDBExpert> sourcedb(new CSeqDBExpert(args[kArgDb].AsString(), seq_type));
00254         string title;
00255         if (args[kArgDbTitle].HasValue()) {
00256             title = args[kArgDbTitle].AsString();
00257         } else {
00258             CNcbiOstrstream oss;
00259             oss << "Copy of '" << sourcedb->GetDBNameList() << "': " << sourcedb->GetTitle();
00260             title = CNcbiOstrstreamToString(oss);
00261         }
00262 
00263         const bool kCopyPIGs = x_ShouldCopyPIGs(args[kArgDb].AsString(),
00264                                                               seq_type);
00265         CBlastDbBioseqSource bioseq_source(sourcedb, gilist,
00266                                            args["membership_bits"]);
00267         const bool kIsSparse = false;
00268         const bool kParseSeqids = x_ShouldParseSeqIds(args[kArgDb].AsString(),
00269                                                       seq_type);
00270 
00271 
00272         const bool kUseGiMask = false;
00273         CStopWatch timer;
00274         timer.Start();
00275         CBuildDatabase destdb(args[kArgOutput].AsString(), title,
00276                               static_cast<bool>(seq_type == CSeqDB::eProtein),
00277                               kIsSparse, kParseSeqids, kUseGiMask,
00278                               &(args["logfile"].HasValue() 
00279                                ? args["logfile"].AsOutputFile() : cerr));
00280         destdb.SetUseRemote(false);
00281         //destdb.SetVerbosity(true);
00282         destdb.SetSourceDb(sourcedb);
00283         destdb.StartBuild();
00284         destdb.SetMembBits(bioseq_source.GetMembershipBits(), false);
00285         destdb.AddSequences(bioseq_source, kCopyPIGs);
00286         destdb.EndBuild();
00287         timer.Stop();
00288         ERR_POST(Info << "Created BLAST database in " << timer.AsSmartString());
00289     }}
00290     catch (const CException& ex) {
00291         LOG_POST( Error << ex );
00292         DeleteBlastDb(args[kArgOutput].AsString(), seq_type);
00293         retval = -1;
00294     }
00295     catch (...) {
00296         LOG_POST( Error << "Unknown error in BlastdbCopyApplication::Run()" );
00297         DeleteBlastDb(args[kArgOutput].AsString(), seq_type);
00298         retval = -2;
00299     }
00300 
00301     return retval;
00302 }
00303 
00304 /////////////////////////////////////////////////////////////////////////////
00305 //  Cleanup
00306 
00307 
00308 void BlastdbCopyApplication::Exit(void)
00309 {
00310     SetDiagStream(0);
00311 }
00312 
00313 
00314 /////////////////////////////////////////////////////////////////////////////
00315 //  MAIN
00316 
00317 
00318 int main(int argc, const char* argv[])
00319 {
00320     // Execute main application function
00321     return BlastdbCopyApplication().AppMain(argc, argv, 0, eDS_Default, 0);
00322 }
Modified on Wed May 23 13:09:42 2012 by modify_doxy.py rev. 337098