NCBI C++ Toolkit Cross Reference

C++/src/util/format_guess.cpp


  1 /*  $Id: format_guess.cpp 173835 2009-10-21 15:07:35Z ludwigf $
  2  * ===========================================================================
  3  *
  4  *                            PUBLIC DOMAIN NOTICE
  5  *               National Center for Biotechnology Information
  6  *
  7  *  This software/database is a "United States Government Work" under the
  8  *  terms of the United States Copyright Act.  It was written as part of
  9  *  the author's official duties as a United States Government employee and
 10  *  thus cannot be copyrighted.  This software/database is freely available
 11  *  to the public for use. The National Library of Medicine and the U.S.
 12  *  Government have not placed any restriction on its use or reproduction.
 13  *
 14  *  Although all reasonable efforts have been taken to ensure the accuracy
 15  *  and reliability of the software and data, the NLM and the U.S.
 16  *  Government do not and cannot warrant the performance or results that
 17  *  may be obtained by using this software or data. The NLM and the U.S.
 18  *  Government disclaim all warranties, express or implied, including
 19  *  warranties of performance, merchantability or fitness for any particular
 20  *  purpose.
 21  *
 22  *  Please cite the author in any work or product based on this material.
 23  *
 24  * ===========================================================================
 25  *
 26  * Author: Anatoliy Kuznetsov
 27  *
 28  * File Description:  Implemented methods to identify file formats.
 29  *
 30  */
 31 
 32 #include <ncbi_pch.hpp>
 33 #include <util/format_guess.hpp>
 34 #include <util/util_exception.hpp>
 35 #include <corelib/ncbifile.hpp>
 36 #include <corelib/ncbistre.hpp>
 37 #include <corelib/stream_utils.hpp>
 38 
 39 BEGIN_NCBI_SCOPE
 40 
 41 enum ESymbolType {
 42     fDNA_Alphabet       = 1<<0,
 43     fProtein_Alphabet   = 1<<1,
 44     fLineEnd            = 1<<2,
 45     fAlpha              = 1<<3,
 46     fDigit              = 1<<4,
 47     fSpace              = 1<<5,
 48     fInvalid            = 1<<6
 49 };
 50 
 51 enum EConfidence {
 52     eNo = 0,
 53     eMaybe,
 54     eYes
 55 };
 56 
 57 
 58 //  ============================================================================
 59 //  Helper routine--- file scope only:
 60 //  ============================================================================
 61 
 62 static unsigned char symbol_type_table[256];
 63 
 64 //  ----------------------------------------------------------------------------
 65 static bool s_IsTokenPosInt(
 66     const string& strToken )
 67 {
 68     return ( -1 != NStr::StringToNumeric( strToken ) );
 69 }
 70 
 71 //  ----------------------------------------------------------------------------
 72 static bool s_IsTokenInteger(
 73     const string& strToken )
 74 //  ----------------------------------------------------------------------------
 75 {
 76     if ( ! strToken.empty() && strToken[0] == '-' ) {
 77         return s_IsTokenPosInt( strToken.substr( 1 ) );
 78     }
 79     return s_IsTokenPosInt( strToken );
 80 }
 81 
 82 //  ----------------------------------------------------------------------------
 83 static bool s_IsTokenDouble(
 84     const string& strToken )
 85 {
 86     string token( strToken );
 87     NStr::ReplaceInPlace( token, ".", "1", 0, 1 );
 88     if ( token.size() > 1 && token[0] == '-' ) {
 89         token[0] = '1';
 90     }
 91     return s_IsTokenPosInt( token );
 92 }
 93 
 94 //  ----------------------------------------------------------------------------
 95 static void init_symbol_type_table(void)
 96 {
 97     if ( symbol_type_table[0] == 0 ) {
 98         for ( const char* s = "ATGCN"; *s; ++s ) {
 99             unsigned char c = *s;
100             symbol_type_table[c] |= fDNA_Alphabet;
101             c = tolower(c);
102             symbol_type_table[c] |= fDNA_Alphabet;
103         }
104         for ( const char* s = "ACDEFGHIKLMNPQRSTVWYBZX"; *s; ++s ) {
105             unsigned char c = *s;
106             symbol_type_table[c] |= fProtein_Alphabet;
107             c = tolower(c);
108             symbol_type_table[c] |= fProtein_Alphabet;
109         }
110         for ( const char* s = "\r\n"; *s; ++s ) {
111             unsigned char c = *s;
112             symbol_type_table[c] |= fLineEnd;
113         }
114         for ( int c = 1; c < 256; ++c ) {
115             if ( isalpha(c) )
116                 symbol_type_table[c] |= fAlpha;
117             if ( isdigit(c) )
118                 symbol_type_table[c] |= fDigit;
119             if ( isspace(c) )
120                 symbol_type_table[c] |= fSpace;
121         }
122         symbol_type_table[0] |= fInvalid;
123     }
124 }
125 
126 
127 //  ============================================================================
128 //  Old style class interface:
129 //  ============================================================================
130 
131 //  ----------------------------------------------------------------------------
132 CFormatGuess::ESequenceType
133 CFormatGuess::SequenceType(const char* str, unsigned length)
134 {
135     if (length == 0)
136         length = (unsigned)::strlen(str);
137 
138     init_symbol_type_table();
139     unsigned ATGC_content = 0;
140     unsigned amino_acid_content = 0;
141 
142     for (unsigned i = 0; i < length; ++i) {
143         unsigned char c = str[i];
144         unsigned char type = symbol_type_table[c];
145         if ( type & fDNA_Alphabet ) {
146             ++ATGC_content;
147         }
148         if ( type & fProtein_Alphabet ) {
149             ++amino_acid_content;
150         }
151     }
152 
153     double dna_content = (double)ATGC_content / (double)length;
154     double prot_content = (double)amino_acid_content / (double)length;
155 
156     if (dna_content > 0.7) {
157         return eNucleotide;
158     }
159     if (prot_content > 0.7) {
160         return eProtein;
161     }
162     return eUndefined;
163 }
164 
165 
166 //  ----------------------------------------------------------------------------
167 CFormatGuess::EFormat CFormatGuess::Format(const string& path, EOnError onerror)
168 {
169     CNcbiIfstream input(path.c_str(), IOS_BASE::in | IOS_BASE::binary);
170     return Format(input);
171 }
172 
173 //  ----------------------------------------------------------------------------
174 CFormatGuess::EFormat CFormatGuess::Format(CNcbiIstream& input, EOnError onerror)
175 {
176     CFormatGuess FG( input );
177     return FG.GuessFormat( onerror );
178 }
179 
180 
181 //  ============================================================================
182 //  New style object interface:
183 //  ============================================================================
184 
185 //  ----------------------------------------------------------------------------
186 CFormatGuess::CFormatGuess()
187     : m_Stream( * new CNcbiIfstream )
188     , m_bOwnsStream( true )
189 {
190     Initialize();
191 }
192 
193 //  ----------------------------------------------------------------------------
194 CFormatGuess::CFormatGuess(
195     const string& FileName )
196     : m_Stream( * new CNcbiIfstream( FileName.c_str() ) )
197     , m_bOwnsStream( true )
198 {
199     Initialize();
200 }
201 
202 //  ----------------------------------------------------------------------------
203 CFormatGuess::CFormatGuess(
204     CNcbiIstream& Stream )
205     : m_Stream( Stream )
206     , m_bOwnsStream( false )
207 {
208     Initialize();
209 }
210 
211 //  ----------------------------------------------------------------------------
212 CFormatGuess::~CFormatGuess()
213 {
214     delete[] m_pTestBuffer;
215     if ( m_bOwnsStream ) {
216         delete &m_Stream;
217     }
218 }
219 
220 //  ----------------------------------------------------------------------------
221 CFormatGuess::EFormat
222 CFormatGuess::GuessFormat( EMode )
223 {
224     return GuessFormat(eDefault);
225 }
226 
227 //  ----------------------------------------------------------------------------
228 CFormatGuess::EFormat
229 CFormatGuess::GuessFormat(
230     EOnError onerror )
231 {
232     if (!x_TestInput(m_Stream, onerror)) {
233         return eUnknown;
234     }
235     EMode mode = eQuick;
236 
237     // First, try to use hints
238     if ( !m_Hints.IsEmpty() ) {
239         for (int f = 1 /* skip eUnknown */; f < eFormat_max; ++f) {
240             EFormat fmt = EFormat(f);
241             if (m_Hints.IsPreferred(fmt)  &&  x_TestFormat(fmt, mode)) {
242                 return fmt;
243             }
244         }
245     }
246 
247     // Check other formats
248     for (int f = 1 /* skip eUnknown */; f < eFormat_max; ++f) {
249         if ( x_TestFormat(EFormat(f), mode) ) {
250             return EFormat(f);
251         }
252     }
253     return eUnknown;
254 }
255 
256 //  ----------------------------------------------------------------------------
257 bool
258 CFormatGuess::TestFormat( EFormat format, EMode )
259 {
260     return TestFormat( format, eDefault);
261 }
262 
263 //  ----------------------------------------------------------------------------
264 bool
265 CFormatGuess::TestFormat(
266     EFormat format,
267     EOnError onerror )
268 {
269     if (format != eUnknown && !x_TestInput(m_Stream, onerror)) {
270         return false;
271     }
272     EMode mode = eQuick;
273     return x_TestFormat(format, mode);
274 }
275 
276 //  ----------------------------------------------------------------------------
277 bool CFormatGuess::x_TestFormat(EFormat format, EMode mode)
278 {
279     // First check if the format is disabled
280     if ( m_Hints.IsDisabled(format) ) {
281         return false;
282     }
283 
284     switch( format ) {
285 
286     case eBinaryASN:
287         return TestFormatBinaryAsn( mode );
288     case eRmo:
289         return TestFormatRepeatMasker( mode );
290     case eGtf:
291         return TestFormatGtf( mode );
292     case eGlimmer3:
293         return TestFormatGlimmer3( mode );
294     case eAgp:
295         return TestFormatAgp( mode );
296     case eXml:
297         return TestFormatXml( mode );
298     case eWiggle:
299         return TestFormatWiggle( mode );
300     case eBed:
301         return TestFormatBed( mode );
302     case eBed15:
303         return TestFormatBed15( mode );
304     case eNewick:
305         return TestFormatNewick( mode );
306     case eAlignment:
307         return TestFormatAlignment( mode );
308     case eDistanceMatrix:
309         return TestFormatDistanceMatrix( mode );
310     case eFlatFileSequence:
311         return TestFormatFlatFileSequence( mode );
312     case eFiveColFeatureTable:
313         return TestFormatFiveColFeatureTable( mode );
314     case eSnpMarkers:
315         return TestFormatSnpMarkers( mode );
316     case eFasta:
317         return TestFormatFasta( mode );
318     case eTextASN:
319         return TestFormatTextAsn( mode );
320     case eTaxplot:
321         return TestFormatTaxplot( mode );
322     case ePhrapAce:
323         return TestFormatPhrapAce( mode );
324     case eTable:
325         return TestFormatTable( mode );
326 
327     default:
328         NCBI_THROW( CCoreException, eInvalidArg,
329             "CFormatGuess::x_TestFormat(): Unsupported format ID." );
330     }
331 }
332 
333 //  ----------------------------------------------------------------------------
334 void
335 CFormatGuess::Initialize()
336 {
337     m_pTestBuffer = 0;
338 
339     m_bStatsAreValid = false;
340     m_bSplitDone = false;
341     m_iStatsCountData = 0;
342     m_iStatsCountAlNumChars = 0;
343     m_iStatsCountDnaChars = 0;
344     m_iStatsCountAaChars = 0;
345 }
346 
347 //  ----------------------------------------------------------------------------
348 bool
349 CFormatGuess::EnsureTestBuffer()
350 {
351     if ( m_pTestBuffer ) {
352         return true;
353     }
354     if ( ! m_Stream.good() ) {
355         return false;
356     }
357     m_pTestBuffer = new char[ s_iTestBufferSize ];
358     m_Stream.read( m_pTestBuffer, s_iTestBufferSize );
359     m_iTestDataSize = m_Stream.gcount();
360     m_Stream.clear();  // in case we reached eof
361     CStreamUtils::Stepback( m_Stream, m_pTestBuffer, m_iTestDataSize );
362     return true;
363 }
364 
365 //  ----------------------------------------------------------------------------
366 bool
367 CFormatGuess::EnsureStats()
368 {
369     if ( m_bStatsAreValid ) {
370         return true;
371     }
372     if ( ! EnsureTestBuffer() ) {
373         return false;
374     }
375     if ( m_iTestDataSize == 0 ) {
376         m_bStatsAreValid = true;
377         return true;
378     }
379 
380     CNcbiIstrstream TestBuffer(
381         reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
382     string strLine;
383 
384     init_symbol_type_table();
385     // Things we keep track of:
386     //   m_iStatsCountAlNumChars: number of characters that are letters or
387     //     digits
388     //   m_iStatsCountData: number of characters not part of a line starting
389     //     with '>'
390     //   m_iStatsCountDnaChars: number of characters counted in m_iStatsCountData
391     //     from the DNA alphabet
392     //   m_iStatsCountAaChars: number of characters counted in m_iStatsCountData
393     //     from the AA alphabet
394     //
395     while ( ! TestBuffer.fail() ) {
396         NcbiGetlineEOL( TestBuffer, strLine );
397 // code in CFormatGuess::Format counts line ends
398 // so, we will count them here as well
399         if (!strLine.empty()) {
400             strLine += '\n';
401         }
402         size_t size = strLine.size();
403         bool is_header = size > 0 && strLine[0] == '>';
404         for ( size_t i=0; i < size; ++i ) {
405             unsigned char c = strLine[i];
406             unsigned char type = symbol_type_table[c];
407 
408             if ( type & (fAlpha | fDigit | fSpace) ) {
409                 ++m_iStatsCountAlNumChars;
410             }
411             if ( !is_header ) {
412                 ++m_iStatsCountData;
413 
414                 if ( type & fDNA_Alphabet ) {
415                     ++m_iStatsCountDnaChars;
416                 }
417                 if ( type & fProtein_Alphabet ) {
418                     ++m_iStatsCountAaChars;
419                 }
420                 if ( type & fLineEnd ) {
421                     ++m_iStatsCountAlNumChars;
422                     --m_iStatsCountData;
423                 }
424             }
425         }
426     }
427     m_bStatsAreValid = true;
428     return true;
429 }
430 
431 //  ----------------------------------------------------------------------------
432 bool CFormatGuess::x_TestInput( CNcbiIstream& input, EOnError onerror )
433 {
434     if (!input) {
435         if (onerror == eThrowOnBadSource) {
436             NCBI_THROW(CUtilException,eNoInput,"Unreadable input stream");
437         }
438         return false;
439     }
440     return true;
441 }
442 
443 //  ----------------------------------------------------------------------------
444 bool
445 CFormatGuess::TestFormatRepeatMasker(
446     EMode /* not used */ )
447 {
448     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
449         return false;
450     }
451     return IsInputRepeatMaskerWithHeader() ||
452         IsInputRepeatMaskerWithoutHeader();
453 }
454 
455 //  ----------------------------------------------------------------------------
456 bool
457 CFormatGuess::TestFormatPhrapAce(
458     EMode /* not used */ )
459 {
460     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
461         return false;
462     }
463 
464     ITERATE( list<string>, it, m_TestLines ) {
465         if ( IsLinePhrapId( *it ) ) {
466             return true;
467         }
468     }
469     return false;
470 }
471 
472 //  -----------------------------------------------------------------------------
473 bool
474 CFormatGuess::TestFormatGtf(
475     EMode /* not used */ )
476 {
477     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
478         return false;
479     }
480 
481     unsigned int uGtfLineCount = 0;
482     list<string>::iterator it = m_TestLines.begin();
483     for ( ;  it != m_TestLines.end();  ++it) {
484         if ( !it->empty()  &&  (*it)[0] != '#') {
485             break;
486         }
487     }
488 
489     for ( ;  it != m_TestLines.end();  ++it) {
490         //
491         //  Make sure to ignore any UCSC track and browser lines prior to the
492         //  start of data
493         //
494         if ( !uGtfLineCount && NStr::StartsWith( *it, "browser " ) ) {
495             continue;
496         }
497         if ( !uGtfLineCount && NStr::StartsWith( *it, "track " ) ) {
498             continue;
499         }
500         if ( ! IsLineGtf( *it ) ) {
501             return false;
502         }
503         ++uGtfLineCount;
504     }
505     return (uGtfLineCount != 0);
506 }
507 
508 //  -----------------------------------------------------------------------------
509 bool
510 CFormatGuess::TestFormatGlimmer3(
511     EMode /* not used */ )
512 {
513     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
514         return false;
515     }
516 
517     /// first line should be a FASTA defline
518     list<string>::iterator it = m_TestLines.begin();
519     if (it->empty()  ||  (*it)[0] != '>') {
520         return false;
521     }
522 
523     /// next lines should be easily parseable, with five columns
524     for (++it;  it != m_TestLines.end();  ++it) {
525         if ( IsLineGlimmer3( *it ) ) {
526             return true;
527         }
528     }
529 
530     return false;
531 }
532 
533 //  -----------------------------------------------------------------------------
534 bool
535 CFormatGuess::TestFormatAgp(
536     EMode /* not used */ )
537 {
538     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
539         return false;
540     }
541     ITERATE( list<string>, it, m_TestLines ) {
542         if ( ! IsLineAgp( *it ) ) {
543             return false;
544         }
545     }
546     return true;
547 }
548 
549 //  -----------------------------------------------------------------------------
550 bool
551 CFormatGuess::TestFormatNewick(
552     EMode /* not used */ )
553 {
554     if ( ! EnsureTestBuffer() ) {
555         return false;
556     }
557     // Maybe we get home early ...
558     if ( m_iTestDataSize > 0 && m_pTestBuffer[0] != '(' ) {
559         return false;
560     }
561     if ( ! EnsureSplitLines() ) {
562         return false;
563     }
564 
565     string one_line;
566     ITERATE( list<string>, it, m_TestLines ) {
567         one_line += *it;
568     }
569 
570     if ( ! IsLineNewick( one_line ) ) {
571         return false;
572     }
573     return true;
574 }
575 
576 //  -----------------------------------------------------------------------------
577 bool
578 CFormatGuess::TestFormatBinaryAsn(
579     EMode /* not used */ )
580 {
581     if ( ! EnsureTestBuffer() ) {
582         return false;
583     }
584 
585     //
586     //  Criterion: Presence of any non-printing characters
587     //
588     EConfidence conf = eNo;
589     for (int i = 0;  i < m_iTestDataSize;  ++i) {
590         if ( !isgraph((unsigned char) m_pTestBuffer[i])  &&
591              !isspace((unsigned char) m_pTestBuffer[i]) )
592         {
593             if (m_pTestBuffer[i] == '\1') {
594                 conf = eMaybe;
595             } else {
596                 return true;
597             }
598         }
599     }
600     return (conf == eYes);
601 }
602 
603 
604 //  -----------------------------------------------------------------------------
605 bool
606 CFormatGuess::TestFormatDistanceMatrix(
607     EMode /* not used */ )
608 {
609     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
610         return false;
611     }
612 
613     //
614     // criteria are odd:
615     //
616     list<string>::const_iterator iter = m_TestLines.begin();
617     list<string> toks;
618 
619     /// first line: one token, one number
620     NStr::Split(*iter++, "\t ", toks);
621     if (toks.size() != 1  ||
622         toks.front().find_first_not_of("0123456789") != string::npos) {
623         return false;
624     }
625 
626     // now, for remaining ones, we expect an alphanumeric item first,
627     // followed by a set of floating-point values.  Unless we are at the last
628     // line, the number of values should increase monotonically
629     for (size_t i = 1;  iter != m_TestLines.end();  ++i, ++iter) {
630         toks.clear();
631         NStr::Split(*iter, "\t ", toks);
632         if (toks.size() != i) {
633             /// we can ignore the last line ; it may be truncated
634             list<string>::const_iterator it = iter;
635             ++it;
636             if (it != m_TestLines.end()) {
637                 return false;
638             }
639         }
640 
641         list<string>::const_iterator it = toks.begin();
642         for (++it;  it != toks.end();  ++it) {
643             if ( ! s_IsTokenDouble( *it ) ) {
644                 return false;
645             }
646         }
647     }
648 
649     return true;
650 }
651 
652 //  -----------------------------------------------------------------------------
653 bool
654 CFormatGuess::TestFormatFlatFileSequence(
655     EMode /* not used */ )
656 {
657     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
658         return false;
659     }
660 
661     ITERATE (list<string>, it, m_TestLines) {
662         if ( ! IsLineFlatFileSequence( *it ) ) {
663             return false;
664         }
665     }
666     return true;
667 }
668 
669 //  -----------------------------------------------------------------------------
670 bool
671 CFormatGuess::TestFormatFiveColFeatureTable(
672     EMode /* not used */ )
673 {
674     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
675         return false;
676     }
677 
678     ITERATE( list<string>, it, m_TestLines ) {
679         if (it->empty()) {
680             continue;
681         }
682 
683         if (it->find(">Feature ") != 0) {
684             return false;
685         }
686         if (it->find_first_of(" \t", 9) != string::npos) {
687             return false;
688         }
689         break;
690     }
691 
692     return true;
693 }
694 
695 //  -----------------------------------------------------------------------------
696 bool
697 CFormatGuess::TestFormatXml(
698     EMode /* not used */ )
699 {
700     if ( ! EnsureTestBuffer() ) {
701         return false;
702     }
703 
704     string input( m_pTestBuffer, m_iTestDataSize );
705     NStr::TruncateSpacesInPlace( input, NStr::eTrunc_Begin );
706 
707     //
708     //  Test 1: If it starts with typical XML decorations such as "<?xml..."
709     //  then respect that:
710     //
711     if ( NStr::StartsWith( input, "<?XML", NStr::eNocase ) ) {
712         return true;
713     }
714     if ( NStr::StartsWith( input, "<!DOCTYPE", NStr::eNocase ) ) {
715         return true;
716     }
717 
718     //
719     //  Test 2: In the absence of XML specific declarations, check whether the
720     //  input starts with the opening tag of a well known set of doc types:
721     //
722     static const char* known_types[] = {
723         "<Blast4-request>"
724     };
725     const int num_types = sizeof( known_types ) / sizeof( const char* );
726 
727     for ( int i=0; i < num_types; ++i ) {
728         if ( NStr::StartsWith( input, known_types[i], NStr::eCase ) ) {
729             return true;
730         }
731     }
732 
733     return false;
734 }
735 
736 //  -----------------------------------------------------------------------------
737 bool
738 CFormatGuess::TestFormatAlignment(
739     EMode /* not used */ )
740 {
741     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
742         return false;
743     }
744 
745     // Alignment files come in all different shapes and broken formats,
746     // and some of them are hard to recognize as such, in particular
747     // if they have been hacked up in a text editor.
748 
749     // This functions only concerns itself with the ones that are
750     // easy to recognize.
751 
752     // Note: We can live with false negatives. Avoid false positives
753     // at all cost.
754 
755     ITERATE( list<string>, it, m_TestLines ) {
756         if ( NPOS != it->find( "#NEXUS" ) ) {
757             return true;
758         }
759         if ( NPOS != it->find( "CLUSTAL" ) ) {
760             return true;
761         }
762     }
763     return false;
764 }
765 
766 //  -----------------------------------------------------------------------------
767 bool
768 CFormatGuess::TestFormatTable(
769     EMode /* not used */ )
770 {
771     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
772         return false;
773     }
774 
775     //
776     //  NOTE 1:
777     //  There is a bunch of file formats that are a special type of table and
778     //  that we want to identify (like Repeat Masker output). So not to shade
779     //  out those more special formats, this test should be performed only after
780     //  all the more specialized table formats have been tested.
781     //
782 
783     //
784     //  NOTE 2:
785     //  The original criterion for this test was "the same number of observed
786     //  columns in every line".
787     //  In order to weed out false positives the following *additional*
788     //  conditions have been imposed:
789     //  - there are at least two observed columns
790     //  - the sample contains at least two non-comment lines.
791     //
792 
793     list<string>::const_iterator iter = m_TestLines.begin();
794     list<string> toks;
795 
796     /// determine the number of observed columns
797     size_t ncols = 0;
798     bool found = false;
799     for ( ;  iter != m_TestLines.end()  &&  ! found;  ++iter) {
800         if (iter->empty()  ||  (*iter)[0] == '#'  ||  (*iter)[0] == ';') {
801             continue;
802         }
803 
804         toks.clear();
805         NStr::Split(*iter, " \t,", toks);
806         ncols = toks.size();
807         found = true;
808     }
809     if ( ncols < 2 ) {
810         return false;
811     }
812 
813     size_t nlines = 1;
814     // verify that columns all have the same size
815     // we can add an exception for the last line
816     for ( ;  iter != m_TestLines.end();  ++iter) {
817         if (iter->empty()  ||  (*iter)[0] == '#'  ||  (*iter)[0] == ';') {
818             continue;
819         }
820 
821         toks.clear();
822         NStr::Split(*iter, " \t,", toks);
823         if (toks.size() != ncols) {
824             list<string>::const_iterator it = iter;
825             ++it;
826             if (it != m_TestLines.end() || (m_iTestDataSize < s_iTestBufferSize) ) {
827                 return false;
828             }
829         } else {
830             ++nlines;
831         }
832     }
833     return ( nlines >= 2 );
834 }
835 
836 //  -----------------------------------------------------------------------------
837 bool
838 CFormatGuess::TestFormatFasta(
839     EMode /* not used */ )
840 {
841     if ( ! EnsureStats() ) {
842         return false;
843     }
844 
845     // reject obvious misfits:
846     if ( m_iTestDataSize == 0 || m_pTestBuffer[0] != '>' ) {
847         return false;
848     }
849     if ( m_iStatsCountData == 0 ) {
850         if (0.75 > double(m_iStatsCountAlNumChars)/double(m_iTestDataSize) ) {
851             return false;
852         }
853         return ( NStr::Find( m_pTestBuffer, "|" ) <= 10 );
854     }
855 
856     // remaining decision based on text stats:
857     double dAlNumFraction =  (double)m_iStatsCountAlNumChars / m_iTestDataSize;
858     double dDnaFraction = (double)m_iStatsCountDnaChars / m_iStatsCountData;
859     double dAaFraction = (double)m_iStatsCountAaChars / m_iStatsCountData;
860 
861     // want at least 80% text-ish overall:
862     if ( dAlNumFraction < 0.8 ) {
863         return false;
864     }
865 
866     // want more than 91 percent of either DNA content or AA content in what we
867     // presume is data:
868     if ( dDnaFraction > 0.91 || dAaFraction > 0.91 ) {
869         return true;
870     }
871     return false;
872 }
873 
874 //  ----------------------------------------------------------------------------
875 bool
876 CFormatGuess::TestFormatTextAsn(
877     EMode /* not used */ )
878 {
879     if ( ! EnsureStats() ) {
880         return false;
881     }
882 
883     // reject obvious misfits:
884     if ( m_iTestDataSize == 0 || m_pTestBuffer[0] == '>' ) {
885         return false;
886     }
887 
888     // criteria:
889     // at least 80% text-ish,
890     // "::=" as the 2nd field of the first non-blank non comment line.
891     //
892     double dAlNumFraction =  (double)m_iStatsCountAlNumChars / m_iTestDataSize;
893     if ( dAlNumFraction < 0.80 ) {
894         return false;
895     }
896 
897     CNcbiIstrstream TestBuffer(
898         reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
899     string strLine;
900 
901     while ( ! TestBuffer.fail() ) {
902         vector<string> Fields;
903         NcbiGetline( TestBuffer, strLine, "\n\r" );
904         NStr::Tokenize( strLine, " \t", Fields, NStr::eMergeDelims );
905         if ( IsAsnComment( Fields  ) ) {
906             continue;
907         }
908         return ( Fields.size() >= 2 && Fields[1] == "::=" );
909     }
910     return false;
911 }
912 
913 //  -----------------------------------------------------------------------------
914 bool
915 CFormatGuess::TestFormatTaxplot(
916     EMode /* not used */ )
917 {
918     return false;
919 }
920 
921 //  -----------------------------------------------------------------------------
922 bool
923 CFormatGuess::TestFormatSnpMarkers(
924     EMode /* not used */ )
925 {
926     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
927         return false;
928     }
929     ITERATE( list<string>, it, m_TestLines ) {
930         string str = *it;
931         int rsid, chr, pos, numMatched;
932         numMatched = sscanf( it->c_str(), "rs%d\t%d\t%d", &rsid, &chr, &pos);
933         if ( numMatched == 3) {
934             return true;
935         }
936     }
937     return false;  
938 }
939 
940 
941 //  ----------------------------------------------------------------------------
942 bool
943 CFormatGuess::TestFormatBed(
944     EMode /* not used */ )
945 {
946     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
947         return false;
948     }
949 
950     bool bTrackLineFound( false );    
951     size_t columncount = 0;
952     ITERATE( list<string>, it, m_TestLines ) {
953         string str = NStr::TruncateSpaces( *it );
954         if ( str.empty() ) {
955             continue;
956         }
957 
958         //
959         //  while occurrence of the following decorations _is_ a good sign, they could
960         //  also be indicator for a variety of other UCSC data formats
961         //
962         if ( NStr::StartsWith( str, "track" ) ) {
963             bTrackLineFound = true;
964             continue;
965         }
966         if ( NStr::StartsWith( str, "browser" ) ) {
967             continue;
968         }
969         if ( NStr::StartsWith( str, "#" ) ) {
970             continue;
971         }
972 
973         vector<string> columns;
974         NStr::Tokenize( str, " \t", columns, NStr::eMergeDelims );
975         if (columns.size() < 3 || columns.size() > 12) {
976             return false;
977         }
978         if ( columns.size() != columncount ) {
979             if ( columncount == 0 ) {
980                 columncount = columns.size();
981             }
982             else {
983                 return false;
984             }
985         }
986     }
987     return bTrackLineFound;
988 }
989 
990 //  ----------------------------------------------------------------------------
991 bool
992 CFormatGuess::TestFormatBed15(
993     EMode /* not used */ )
994 {
995     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
996         return false;
997     }
998 
999     size_t columncount = 15;
1000     ITERATE( list<string>, it, m_TestLines ) {
1001         if ( NStr::TruncateSpaces( *it ).empty() ) {
1002             continue;
1003         }
1004         //
1005         //  while occurrence of the following decorations _is_ a good sign, they could
1006         //  also be indicator for a variety of other UCSC data formats
1007         //
1008         if ( NStr::StartsWith( *it, "track" ) ) {
1009             continue;
1010         }
1011         if ( NStr::StartsWith( *it, "browser" ) ) {
1012             continue;
1013         }
1014         if ( NStr::StartsWith( *it, "#" ) ) {
1015             continue;
1016         }
1017 
1018         vector<string> columns;
1019         NStr::Tokenize( *it, " \t", columns, NStr::eMergeDelims );
1020         if ( columns.size() != columncount ) {
1021             return false;
1022         }
1023     }
1024     return true;
1025 }
1026 
1027 //  ----------------------------------------------------------------------------
1028 bool
1029 CFormatGuess::TestFormatWiggle(
1030     EMode /* not used */ )
1031 {
1032     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1033         return false;
1034     }
1035     ITERATE( list<string>, it, m_TestLines ) {
1036         if ( NStr::StartsWith( *it, "track" ) ) {
1037             if ( NStr::Find( *it, "type=wiggle_0" ) != NPOS ) {
1038                 return true;
1039             }
1040             if ( NStr::Find( *it, "type=bedGraph" ) != NPOS ) {
1041                 return true;
1042             }
1043         }
1044     }
1045     return false;
1046 }
1047 
1048 
1049 //  ----------------------------------------------------------------------------
1050 bool CFormatGuess::IsInputRepeatMaskerWithHeader()
1051 {
1052     //
1053     //  Repeatmasker files consist of columnar data with a couple of lines
1054     //  of column labels prepended to it (but sometimes someone strips those
1055     //  labels).
1056     //  This function tries to identify repeatmasker data by those column
1057     //  label lines. They should be the first non-blanks in the file.
1058     //
1059     string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
1060     string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
1061 
1062     //
1063     //  Purge junk lines:
1064     //
1065     list<string>::iterator it = m_TestLines.begin();
1066     for  ( ; it != m_TestLines.end(); ++it ) {
1067         NStr::TruncateSpacesInPlace( *it );
1068         if ( *it != "" ) {
1069             break;
1070         }
1071     }
1072 
1073     if ( it == m_TestLines.end() ) {
1074         return false;
1075     }
1076 
1077     //
1078     //  Verify first line of labels:
1079     //
1080     size_t current_offset = 0;
1081     for ( size_t i=0; labels_1st_line[i] != ""; ++i ) {
1082         current_offset = NStr::FindCase( *it, labels_1st_line[i], current_offset );
1083         if ( current_offset == NPOS ) {
1084             return false;
1085         }
1086     }
1087 
1088     //
1089     //  Verify second line of labels:
1090     //
1091     ++it;
1092     if ( it == m_TestLines.end() ) {
1093         return false;
1094     }
1095     current_offset = 0;
1096     for ( size_t j=0; labels_2nd_line[j] != ""; ++j ) {
1097         current_offset = NStr::FindCase( *it, labels_2nd_line[j], current_offset );
1098         if ( current_offset == NPOS ) {
1099             return false;
1100         }
1101     }
1102 
1103     //
1104     //  Should have at least one extra line:
1105     //
1106     ++it;
1107     if ( it == m_TestLines.end() ) {
1108         return false;
1109     }
1110 
1111     return true;
1112 }
1113 
1114 
1115 //  ----------------------------------------------------------------------------
1116 bool CFormatGuess::IsInputRepeatMaskerWithoutHeader()
1117 {
1118     //
1119     //  Repeatmasker files consist of columnar data with a couple of lines
1120     //  of column labels prepended to it (but sometimes someone strips those
1121     //  labels).
1122     //  This function assumes the column labels have been stripped and attempts
1123     //  to identify RMO by checking the data itself.
1124     //
1125 
1126     //
1127     //  We declare the data as RMO if we are able to parse every record in the
1128     //  sample we got:
1129     //
1130     ITERATE( list<string>, it, m_TestLines ) {
1131         string str = NStr::TruncateSpaces( *it );
1132         if ( str == "" ) {
1133             continue;
1134         }
1135         if ( ! IsLineRmo( str ) ) {
1136             return false;
1137         }
1138     }
1139 
1140     return true;
1141 }
1142 
1143 
1144 //  ----------------------------------------------------------------------------
1145 bool
1146 CFormatGuess::IsLineNewick(
1147     const string& cline )
1148 {
1149     //
1150     //  Note:
1151     //  Newick lines are a little tricky. They contain tree structure of the form
1152     //  (a,b), where each a or be can either be a another tree structure, or a
1153     //  label of the form 'ABCD'. The trickiness comes from the fact that these
1154     //  beasts are highly recursive, to the point that our 1k read buffer may not
1155     //  even cover a single line in the file. Which means, we might only have a
1156     //  partial line to work with.
1157     //
1158     //  The test:
1159     //  Throw away all the labels, i.e. everything between an odd-numbered ' and
1160     //  an even numbered tick. After that, there should only remain '(', ')', ';',
1161     //  ''', ',', or whitespace.
1162     //  Moreover, if there is a semicolon, it must be at the end of the line.
1163     //
1164     string line = NStr::TruncateSpaces( cline );
1165     if ( line.empty() ) {
1166         return false;
1167     }
1168     string delimiters = " ,();";
1169     for ( size_t i=0; line[i] != 0; ++i ) {
1170 
1171         if ( NPOS != delimiters.find( line[i] ) ) {
1172             if ( line[i] == ';' && i != line.size() - 1 ) {
1173                 return false;
1174             }
1175             else {
1176                 continue;
1177             }
1178         }
1179         if ( line[i] == '[' || line[i] == ']' ) {
1180             return false;
1181         }
1182         size_t label_end = line.find_first_of( delimiters, i );
1183         string label = line.substr( i, label_end - i );
1184         if ( ! IsLabelNewick( label ) ) {
1185             return false;
1186         }
1187         if ( NPOS == label_end ) {
1188             return true;
1189         }
1190         i = label_end;
1191     }
1192     return true;
1193 }
1194 
1195 
1196 //  ----------------------------------------------------------------------------
1197 bool CFormatGuess::IsLineFlatFileSequence(
1198     const string& line )
1199 {
1200     // blocks of ten residues (or permitted punctuation characters)
1201     // with a count at the start or end; require at least four
1202     // (normally six)
1203     SIZE_TYPE pos = line.find_first_not_of("0123456789 \t");
1204     if (pos == NPOS  ||  pos + 45 >= line.size()) {
1205         return false;
1206     }
1207 
1208     for (SIZE_TYPE i = 0;  i < 45;  ++i) {
1209         char c = line[pos + i];
1210         if (i % 11 == 10) {
1211             if ( !isspace(c) ) {
1212                 return false;
1213             }
1214         } else {
1215             if ( !isalpha(c)  &&  c != '-'  &&  c != '*') {
1216                 return false;
1217             }
1218         }
1219     }
1220 
1221     return true;
1222 }
1223 
1224 
1225 //  ----------------------------------------------------------------------------
1226 bool CFormatGuess::IsLabelNewick(
1227     const string& label )
1228 {
1229     //  Starts with a string of anything other than "[]:", optionally followed by
1230     //  a single ':', followed by a number, optionally followed by a dot and
1231     //  another number.
1232     if ( NPOS != label.find_first_of( "[]" ) ) {
1233         return false;
1234     }
1235     size_t colon = label.find( ':' );
1236     if ( NPOS == colon ) {
1237         return true;
1238     }
1239     size_t dot = label.find_first_not_of( "0123456789", colon + 1 );
1240     if ( NPOS == dot ) {
1241         return true;
1242     }
1243     if ( label[ dot ] != '.' ) {
1244         return false;
1245     }
1246     size_t end = label.find_first_not_of( "0123456789", dot + 1 );
1247     return ( NPOS == end );
1248 }
1249 
1250 
1251 //  ----------------------------------------------------------------------------
1252 bool CFormatGuess::IsLineAgp( 
1253     const string& strLine )
1254 {
1255     //
1256     //  Note: The reader allows for line and endline comments starting with a '#'.
1257     //  So we accept them here, too.
1258     //
1259     string line( strLine );
1260     size_t uCommentStart = NStr::Find( line, "#" );
1261 
1262     if ( NPOS != uCommentStart ) {
1263         line = line.substr( 0, uCommentStart );
1264     }
1265     NStr::TruncateSpacesInPlace( line );
1266     if ( line.empty() ) {
1267         return true;
1268     }
1269 
1270     vector<string> tokens;
1271     if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1272         return false;
1273     }
1274 
1275     if ( tokens[1].size() > 1 && tokens[1][0] == '-' ) {
1276         tokens[1][0] = '1';
1277     }
1278     if ( -1 == NStr::StringToNumeric( tokens[1] ) ) {
1279         return false;
1280     }
1281 
1282     if ( tokens[2].size() > 1 && tokens[2][0] == '-' ) {
1283         tokens[2][0] = '1';
1284     }
1285     if ( -1 == NStr::StringToNumeric( tokens[2] ) ) {
1286         return false;
1287     }
1288 
1289     if ( tokens[3].size() > 1 && tokens[3][0] == '-' ) {
1290         tokens[3][0] = '1';
1291     }
1292     if ( -1 == NStr::StringToNumeric( tokens[3] ) ) {
1293         return false;
1294     }
1295 
1296     if ( tokens[4].size() != 1 || NPOS == tokens[4].find_first_of( "ADFGPNOW" ) ) {
1297         return false;
1298     }
1299     if ( tokens[4] == "N" ) {
1300         if ( -1 == NStr::StringToNumeric( tokens[5] ) ) {
1301             return false;
1302         }
1303     }
1304     else {
1305         if ( -1 == NStr::StringToNumeric( tokens[6] ) ) {
1306             return false;
1307         }
1308         if ( -1 == NStr::StringToNumeric( tokens[7] ) ) {
1309             return false;
1310         }            
1311         if ( tokens.size() != 9 ) {
1312             return false;
1313         }
1314         if ( tokens[8].size() != 1 || NPOS == tokens[8].find_first_of( "+-" ) ) {
1315             return false;
1316         }
1317     }
1318 
1319     return true;
1320 }
1321 
1322 
1323 //  ----------------------------------------------------------------------------
1324 bool CFormatGuess::IsLineGlimmer3(
1325     const string& line )
1326 {
1327     list<string> toks;
1328     NStr::Split(line, "\t ", toks);
1329     if (toks.size() != 5) {
1330         return false;
1331     }
1332 
1333     list<string>::iterator i = toks.begin();
1334 
1335     /// first column: skip (ascii identifier)
1336     ++i;
1337 
1338     /// second, third columns: both ints
1339     if ( ! s_IsTokenInteger( *i++ ) ) {
1340         return false;
1341     }
1342     if ( ! s_IsTokenInteger( *i++ ) ) {
1343         return false;
1344     }
1345 
1346     /// fourth column: int in the range of -3...3
1347     if ( ! s_IsTokenInteger( *i ) ) {
1348         return false;
1349     }
1350     int frame = NStr::StringToInt( *i++ );
1351     if (frame < -3  ||  frame > 3) {
1352         return false;
1353     }
1354 
1355     /// fifth column: score; double
1356     if ( ! s_IsTokenDouble( *i ) ) {
1357         return false;
1358     }
1359 
1360     return true;
1361 }
1362 
1363 
1364 //  ----------------------------------------------------------------------------
1365 bool CFormatGuess::IsLineGtf(
1366     const string& line )
1367 {
1368     vector<string> tokens;
1369     if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1370         return false;
1371     }
1372     if ( ! s_IsTokenPosInt( tokens[3] ) ) {
1373         return false;
1374     }
1375     if ( ! s_IsTokenPosInt( tokens[4] ) ) {
1376         return false;
1377     }
1378     if ( ! s_IsTokenDouble( tokens[5] ) ) {
1379         return false;
1380     }
1381     if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
1382         return false;
1383     }
1384     if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
1385         return false;
1386     }
1387     return true;
1388 }
1389 
1390 
1391 //  ----------------------------------------------------------------------------
1392 bool CFormatGuess::IsLinePhrapId(
1393     const string& line )
1394 {
1395     vector<string> values;
1396     if ( NStr::Tokenize( line, " \t", values, NStr::eMergeDelims ).empty() ) {
1397         return false;
1398     }
1399 
1400     //
1401     //  Old style: "^DNA \\w+ "
1402     //
1403     if ( values[0] == "DNA" ) {
1404         return true;
1405     }
1406 
1407     //
1408     //  New style: "^AS [0-9]+ [0-9]+"
1409     //
1410     if ( values[0] == "AS" ) {
1411         return ( 0 <= NStr::StringToNumeric( values[1] ) &&
1412           0 <= NStr::StringToNumeric( values[2] ) );
1413     }
1414 
1415     return false;
1416 }
1417 
1418 
1419 //  ----------------------------------------------------------------------------
1420 bool CFormatGuess::IsLineRmo(
1421     const string& line )
1422 {
1423     const size_t MIN_VALUES_PER_RECORD = 15;
1424 
1425     //
1426     //  Make sure there is enough stuff on that line:
1427     //
1428     list<string> values;
1429     if ( NStr::Split( line, " \t", values ).size() < MIN_VALUES_PER_RECORD ) {
1430         return false;
1431     }
1432 
1433     //
1434     //  Look at specific values and make sure they are of the correct type:
1435     //
1436 
1437     //  1: positive integer:
1438     list<string>::iterator it = values.begin();
1439     if ( ! s_IsTokenPosInt( *it ) ) {
1440         return false;
1441     }
1442 
1443     //  2: float:
1444     ++it;
1445     if ( ! s_IsTokenDouble( *it ) ) {
1446         return false;
1447     }
1448 
1449     //  3: float:
1450     ++it;
1451     if ( ! s_IsTokenDouble( *it ) ) {
1452         return false;
1453     }
1454 
1455     //  4: float:
1456     ++it;
1457     if ( ! s_IsTokenDouble( *it ) ) {
1458         return false;
1459     }
1460 
1461     //  5: string, not checked
1462     ++it;
1463 
1464     //  6: positive integer:
1465     ++it;
1466     if ( ! s_IsTokenPosInt( *it ) ) {
1467         return false;
1468     }
1469 
1470     //  7: positive integer:
1471     ++it;
1472     if ( ! s_IsTokenPosInt( *it ) ) {
1473         return false;
1474     }
1475 
1476     //  8: positive integer, likely in paretheses, not checked:
1477     ++it;
1478 
1479     //  9: '+' or 'C':
1480     ++it;
1481     if ( *it != "+" && *it != "C" ) {
1482         return false;
1483     }
1484 
1485     //  and that's enough for now. But there are at least two more fields 
1486     //  with values that look testable.
1487 
1488     return true;
1489 }
1490 
1491 
1492 //  ----------------------------------------------------------------------------
1493 bool
1494 CFormatGuess::IsAsnComment(
1495     const vector<string>& Fields )
1496 {
1497     if ( Fields.size() == 0 ) {
1498         return true;
1499     }
1500     return ( NStr::StartsWith( Fields[0], "--" ) );
1501 }
1502 
1503 //  ----------------------------------------------------------------------------
1504 bool
1505 CFormatGuess::EnsureSplitLines()
1506 //  ----------------------------------------------------------------------------
1507 {
1508     if ( m_bSplitDone ) {
1509         return !m_TestLines.empty();
1510     }
1511     m_bSplitDone = true;
1512 
1513     //
1514     //  Make sure the given data is ASCII before checking potential line breaks:
1515     //
1516     const size_t MIN_HIGH_RATIO = 20;
1517     size_t high_count = 0;
1518     for ( streamsize i=0; i < m_iTestDataSize; ++i ) {
1519         if ( 0x80 & m_pTestBuffer[i] ) {
1520             ++high_count;
1521         }
1522     }
1523     if ( 0 < high_count && m_iTestDataSize / high_count < MIN_HIGH_RATIO ) {
1524         return false;
1525     }
1526 
1527     //
1528     //  Let's expect at least one line break in the given data:
1529     //
1530     string data( m_pTestBuffer, m_iTestDataSize );
1531 
1532     m_TestLines.clear();
1533     if ( NStr::Split( data, "\r\n", m_TestLines ).size() <= 1 ) {
1534         m_TestLines.clear();
1535         if ( NStr::Split( data, "\r", m_TestLines ).size() <= 1 ) {
1536             m_TestLines.clear();
1537             NStr::Split( data, "\n", m_TestLines );
1538         }
1539     }
1540     if ( (m_TestLines.size() <= 1) && (m_iTestDataSize == s_iTestBufferSize) ) {
1541         //
1542         //  single truncated line...
1543         //
1544         m_TestLines.clear();
1545         return false;
1546     }
1547 
1548     if ( m_iTestDataSize == s_iTestBufferSize ) {
1549         m_TestLines.pop_back();
1550     }
1551     return !m_TestLines.empty();
1552 }
1553 
1554 END_NCBI_SCOPE
1555 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.