|
NCBI C++ ToolKit
|
00001 /* $Id: blastdbcp.cpp 52305 2011-12-15 14:16:31Z fongah2 $ 00002 * =========================================================================== 00003 * 00004 * PUBLIC DOMAIN NOTICE 00005 * National Center for Biotechnology Information 00006 * 00007 * This software/database is a "United States Government Work" under the 00008 * terms of the United States Copyright Act. It was written as part of 00009 * the author's official duties as a United States Government employee and 00010 * thus cannot be copyrighted. This software/database is freely available 00011 * to the public for use. The National Library of Medicine and the U.S. 00012 * Government have not placed any restriction on its use or reproduction. 00013 * 00014 * Although all reasonable efforts have been taken to ensure the accuracy 00015 * and reliability of the software and data, the NLM and the U.S. 00016 * Government do not and cannot warrant the performance or results that 00017 * may be obtained by using this software or data. The NLM and the U.S. 00018 * Government disclaim all warranties, express or implied, including 00019 * warranties of performance, merchantability or fitness for any particular 00020 * purpose. 00021 * 00022 * Please cite the author in any work or product based on this material. 00023 * 00024 * =========================================================================== 00025 */ 00026 /** @file blastdbcp.cpp 00027 * @author Christiam Camacho 00028 */ 00029 00030 #include <ncbi_pch.hpp> 00031 #include <corelib/ncbiapp.hpp> 00032 #include <algo/blast/blastinput/cmdline_flags.hpp> 00033 #include <objtools/blast/seqdb_writer/build_db.hpp> 00034 00035 USING_NCBI_SCOPE; 00036 USING_SCOPE(blast); 00037 00038 00039 ///////////////////////////////////////////////////////////////////////////// 00040 // BlastdbCopyApplication:: 00041 00042 00043 class BlastdbCopyApplication : public CNcbiApplication 00044 { 00045 public: 00046 BlastdbCopyApplication(); 00047 00048 private: /* Private Methods */ 00049 virtual void Init(void); 00050 virtual int Run(void); 00051 virtual void Exit(void); 00052 00053 bool x_ShouldParseSeqIds(const string& dbname, 00054 CSeqDB::ESeqType seq_type) const; 00055 00056 bool x_ShouldCopyPIGs(const string& dbname, 00057 CSeqDB::ESeqType seq_type) const; 00058 00059 private: /* Private Data */ 00060 bool m_bCheckOnly; 00061 }; 00062 00063 ///////////////////////////////////////////////////////////////////////////// 00064 // Constructor 00065 00066 BlastdbCopyApplication::BlastdbCopyApplication() 00067 : m_bCheckOnly(false) 00068 { 00069 CRef<CVersion> version(new CVersion()); 00070 version->SetVersionInfo(1, 0); 00071 SetFullVersion(version); 00072 } 00073 00074 00075 ///////////////////////////////////////////////////////////////////////////// 00076 // Init test for all different types of arguments 00077 00078 00079 void BlastdbCopyApplication::Init(void) 00080 { 00081 // Create command-line argument descriptions class 00082 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions); 00083 00084 // Specify USAGE context 00085 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(), 00086 "Performs a (deep) copy of a subset of a BLAST database"); 00087 00088 arg_desc->SetCurrentGroup("BLAST database options"); 00089 arg_desc->AddDefaultKey(kArgDb, "dbname", "BLAST database name", 00090 CArgDescriptions::eString, "nr"); 00091 00092 arg_desc->AddDefaultKey(kArgDbType, "molecule_type", 00093 "Molecule type stored in BLAST database", 00094 CArgDescriptions::eString, "prot"); 00095 arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings, 00096 "nucl", "prot", "guess")); 00097 00098 arg_desc->SetCurrentGroup("Configuration options"); 00099 arg_desc->AddOptionalKey(kArgDbTitle, "database_title", 00100 "Title for BLAST database", 00101 CArgDescriptions::eString); 00102 arg_desc->AddKey(kArgGiList, "input_file", 00103 "Text or binary gi file to restrict the BLAST " 00104 "database provided in -db argument", 00105 CArgDescriptions::eString); 00106 arg_desc->AddFlag("membership_bits", "Copy the membershi bits", true); 00107 00108 arg_desc->SetCurrentGroup("Output options"); 00109 arg_desc->AddOptionalKey(kArgOutput, "database_name", 00110 "Name of BLAST database to be created", 00111 CArgDescriptions::eString); 00112 HideStdArgs(fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun); 00113 SetupArgDescriptions(arg_desc.release()); 00114 } 00115 00116 class CBlastDbBioseqSource : public IBioseqSource 00117 { 00118 public: 00119 CBlastDbBioseqSource(CRef<CSeqDBExpert> blastdb, 00120 CRef<CSeqDBGiList> gilist, 00121 bool copy_membership_bits = false) 00122 { 00123 CStopWatch total_timer, bioseq_timer, memb_timer; 00124 total_timer.Start(); 00125 for (int i = 0; i < gilist->GetNumGis(); i++) { 00126 const CSeqDBGiList::SGiOid& elem = gilist->GetGiOid(i); 00127 int oid = 0; 00128 if ( !blastdb->GiToOid(elem.gi, oid)) { 00129 // not found on source BLASTDB, skip 00130 continue; 00131 } 00132 if (m_Oids2Copy.insert(oid).second == false) { 00133 // don't add the same OID twice to avoid duplicates 00134 continue; 00135 } 00136 bioseq_timer.Start(); 00137 CConstRef<CBioseq> bs(&*blastdb->GetBioseq(oid)); 00138 m_Bioseqs.push_back(bs); 00139 bioseq_timer.Stop(); 00140 00141 if (copy_membership_bits == false) 00142 continue; 00143 00144 memb_timer.Start(); 00145 CRef<CBlast_def_line_set> hdr = CSeqDB::ExtractBlastDefline(*bs); 00146 ITERATE(CBlast_def_line_set::Tdata, itr, hdr->Get()) { 00147 CRef<CBlast_def_line> bdl = *itr; 00148 if (bdl->CanGetMemberships() && 00149 !bdl->GetMemberships().empty()) { 00150 int memb_bits = bdl->GetMemberships().front(); 00151 if (memb_bits == 0) { 00152 continue; 00153 } 00154 const string id = bdl->GetSeqid().front()->AsFastaString(); 00155 m_MembershipBits[memb_bits].push_back(id); 00156 } 00157 } 00158 memb_timer.Stop(); 00159 } 00160 total_timer.Stop(); 00161 ERR_POST(Info << "Will extract " << m_Bioseqs.size() 00162 << " sequences from the source database"); 00163 ERR_POST(Info << "Processed all input data in " << total_timer.AsSmartString()); 00164 ERR_POST(Info << "Processed bioseqs in " << bioseq_timer.AsSmartString()); 00165 ERR_POST(Info << "Processed membership bits in " << memb_timer.AsSmartString()); 00166 } 00167 00168 const TLinkoutMap GetMembershipBits() const { 00169 return m_MembershipBits; 00170 } 00171 00172 virtual CConstRef<CBioseq> GetNext() 00173 { 00174 if (m_Bioseqs.empty()) { 00175 return CConstRef<CBioseq>(0); 00176 } 00177 CConstRef<CBioseq> retval = m_Bioseqs.back(); 00178 m_Bioseqs.pop_back(); 00179 return retval; 00180 } 00181 private: 00182 typedef list< CConstRef<CBioseq> > TBioseqs; 00183 TBioseqs m_Bioseqs; 00184 set<int> m_Oids2Copy; 00185 TLinkoutMap m_MembershipBits; 00186 }; 00187 00188 bool BlastdbCopyApplication::x_ShouldParseSeqIds(const string& dbname, 00189 CSeqDB::ESeqType seq_type) const 00190 { 00191 vector<string> file_paths; 00192 CSeqDB::FindVolumePaths(dbname, seq_type, file_paths); 00193 const char type = (seq_type == CSeqDB::eProtein ? 'p' : 'n'); 00194 bool retval = false; 00195 const char* isam_extensions[] = { "si", "sd", "ni", "nd", NULL }; 00196 00197 ITERATE(vector<string>, f, file_paths) { 00198 for (int i = 0; isam_extensions[i] != NULL; i++) { 00199 CNcbiOstrstream oss; 00200 oss << *f << "." << type << isam_extensions[i]; 00201 const string fname = CNcbiOstrstreamToString(oss); 00202 CFile file(fname); 00203 if (file.Exists() && file.GetLength() > 0) { 00204 retval = true; 00205 break; 00206 } 00207 } 00208 if (retval) break; 00209 } 00210 return retval; 00211 } 00212 00213 bool BlastdbCopyApplication::x_ShouldCopyPIGs(const string& dbname, 00214 CSeqDB::ESeqType seq_type) const 00215 { 00216 if(CSeqDB::eProtein != seq_type) 00217 return false; 00218 00219 vector<string> file_paths; 00220 CSeqDB::FindVolumePaths(dbname, CSeqDB::eProtein, file_paths); 00221 ITERATE(vector<string>, f, file_paths) { 00222 CNcbiOstrstream oss; 00223 oss << *f << "." << "ppd"; 00224 const string fname = CNcbiOstrstreamToString(oss); 00225 CFile file(fname); 00226 if (file.Exists() && file.GetLength() > 0) 00227 return true; 00228 } 00229 return false; 00230 } 00231 00232 00233 ///////////////////////////////////////////////////////////////////////////// 00234 // Run the program 00235 int BlastdbCopyApplication::Run(void) 00236 { 00237 int retval = 0; 00238 const CArgs& args = GetArgs(); 00239 00240 // Setup Logging 00241 if (args["logfile"]) { 00242 SetDiagPostLevel(eDiag_Info); 00243 SetDiagPostFlag(eDPF_All); 00244 time_t now = time(0); 00245 LOG_POST( Info << string(72,'-') << "\n" << "NEW LOG - " << ctime(&now) ); 00246 } 00247 00248 CSeqDB::ESeqType seq_type = CSeqDB::eUnknown; 00249 try {{ 00250 00251 seq_type = ParseMoleculeTypeString(args[kArgDbType].AsString()); 00252 CRef<CSeqDBGiList> gilist(new CSeqDBFileGiList(args[kArgGiList].AsString())); 00253 CRef<CSeqDBExpert> sourcedb(new CSeqDBExpert(args[kArgDb].AsString(), seq_type)); 00254 string title; 00255 if (args[kArgDbTitle].HasValue()) { 00256 title = args[kArgDbTitle].AsString(); 00257 } else { 00258 CNcbiOstrstream oss; 00259 oss << "Copy of '" << sourcedb->GetDBNameList() << "': " << sourcedb->GetTitle(); 00260 title = CNcbiOstrstreamToString(oss); 00261 } 00262 00263 const bool kCopyPIGs = x_ShouldCopyPIGs(args[kArgDb].AsString(), 00264 seq_type); 00265 CBlastDbBioseqSource bioseq_source(sourcedb, gilist, 00266 args["membership_bits"]); 00267 const bool kIsSparse = false; 00268 const bool kParseSeqids = x_ShouldParseSeqIds(args[kArgDb].AsString(), 00269 seq_type); 00270 00271 00272 const bool kUseGiMask = false; 00273 CStopWatch timer; 00274 timer.Start(); 00275 CBuildDatabase destdb(args[kArgOutput].AsString(), title, 00276 static_cast<bool>(seq_type == CSeqDB::eProtein), 00277 kIsSparse, kParseSeqids, kUseGiMask, 00278 &(args["logfile"].HasValue() 00279 ? args["logfile"].AsOutputFile() : cerr)); 00280 destdb.SetUseRemote(false); 00281 //destdb.SetVerbosity(true); 00282 destdb.SetSourceDb(sourcedb); 00283 destdb.StartBuild(); 00284 destdb.SetMembBits(bioseq_source.GetMembershipBits(), false); 00285 destdb.AddSequences(bioseq_source, kCopyPIGs); 00286 destdb.EndBuild(); 00287 timer.Stop(); 00288 ERR_POST(Info << "Created BLAST database in " << timer.AsSmartString()); 00289 }} 00290 catch (const CException& ex) { 00291 LOG_POST( Error << ex ); 00292 DeleteBlastDb(args[kArgOutput].AsString(), seq_type); 00293 retval = -1; 00294 } 00295 catch (...) { 00296 LOG_POST( Error << "Unknown error in BlastdbCopyApplication::Run()" ); 00297 DeleteBlastDb(args[kArgOutput].AsString(), seq_type); 00298 retval = -2; 00299 } 00300 00301 return retval; 00302 } 00303 00304 ///////////////////////////////////////////////////////////////////////////// 00305 // Cleanup 00306 00307 00308 void BlastdbCopyApplication::Exit(void) 00309 { 00310 SetDiagStream(0); 00311 } 00312 00313 00314 ///////////////////////////////////////////////////////////////////////////// 00315 // MAIN 00316 00317 00318 int main(int argc, const char* argv[]) 00319 { 00320 // Execute main application function 00321 return BlastdbCopyApplication().AppMain(argc, argv, 0, eDS_Default, 0); 00322 }
1.7.5.1
Modified on Wed May 23 13:09:42 2012 by modify_doxy.py rev. 337098