NCBI C++ Toolkit Cross Reference

  C++/src/util/format_guess.cpp


  1 /*  $Id: format_guess.cpp 57368 2013-03-01 14:47:09Z falkrb $
  2  * ===========================================================================
  3  *
  4  *                            PUBLIC DOMAIN NOTICE
  5  *               National Center for Biotechnology Information
  6  *
  7  *  This software/database is a "United States Government Work" under the
  8  *  terms of the United States Copyright Act.  It was written as part of
  9  *  the author's official duties as a United States Government employee and
 10  *  thus cannot be copyrighted.  This software/database is freely available
 11  *  to the public for use. The National Library of Medicine and the U.S.
 12  *  Government have not placed any restriction on its use or reproduction.
 13  *
 14  *  Although all reasonable efforts have been taken to ensure the accuracy
 15  *  and reliability of the software and data, the NLM and the U.S.
 16  *  Government do not and cannot warrant the performance or results that
 17  *  may be obtained by using this software or data. The NLM and the U.S.
 18  *  Government disclaim all warranties, express or implied, including
 19  *  warranties of performance, merchantability or fitness for any particular
 20  *  purpose.
 21  *
 22  *  Please cite the author in any work or product based on this material.
 23  *
 24  * ===========================================================================
 25  *
 26  * Author: Anatoliy Kuznetsov
 27  *
 28  * File Description:  Implemented methods to identify file formats.
 29  *
 30  */
 31 
 32 #include <ncbi_pch.hpp>
 33 #include <util/format_guess.hpp>
 34 #include <util/util_exception.hpp>
 35 #include <corelib/ncbifile.hpp>
 36 #include <corelib/ncbistre.hpp>
 37 #include <corelib/stream_utils.hpp>
 38 
 39 BEGIN_NCBI_SCOPE
 40 
 41 enum ESymbolType {
 42     fDNA_Main_Alphabet  = 1<<0, ///< Just ACGTUN-.
 43     fDNA_Ambig_Alphabet = 1<<1, ///< Anything else representable in ncbi4na.
 44     fProtein_Alphabet   = 1<<2, ///< Allows BZX*-, but not JOU.
 45     fLineEnd            = 1<<3,
 46     fAlpha              = 1<<4,
 47     fDigit              = 1<<5,
 48     fSpace              = 1<<6,
 49     fInvalid            = 1<<7
 50 };
 51 
 52 enum EConfidence {
 53     eNo = 0,
 54     eMaybe,
 55     eYes
 56 };
 57 
 58 
 59 //  ============================================================================
 60 //  Helper routine--- file scope only:
 61 //  ============================================================================
 62 
 63 static unsigned char symbol_type_table[256];
 64 
 65 //  ----------------------------------------------------------------------------
 66 static bool s_IsTokenPosInt(
 67     const string& strToken )
 68 {
 69     return ( -1 != NStr::StringToNonNegativeInt( strToken ) );
 70 }
 71 
 72 //  ----------------------------------------------------------------------------
 73 static bool s_IsTokenInteger(
 74     const string& strToken )
 75 //  ----------------------------------------------------------------------------
 76 {
 77     if ( ! strToken.empty() && strToken[0] == '-' ) {
 78         return s_IsTokenPosInt( strToken.substr( 1 ) );
 79     }
 80     return s_IsTokenPosInt( strToken );
 81 }
 82 
 83 //  ----------------------------------------------------------------------------
 84 static bool s_IsTokenDouble(
 85     const string& strToken )
 86 {
 87     string token( strToken );
 88     NStr::ReplaceInPlace( token, ".", "1", 0, 1 );
 89     if ( token.size() > 1 && token[0] == '-' ) {
 90         token[0] = '1';
 91     }
 92     return s_IsTokenPosInt( token );
 93 }
 94 
 95 //  ----------------------------------------------------------------------------
 96 static void init_symbol_type_table(void)
 97 {
 98     if ( symbol_type_table[0] == 0 ) {
 99         for ( const char* s = "ACGNTU"; *s; ++s ) {
100             unsigned char c = *s;
101             symbol_type_table[c] |= fDNA_Main_Alphabet;
102             c = tolower(c);
103             symbol_type_table[c] |= fDNA_Main_Alphabet;
104         }
105         for ( const char* s = "BDHKMRSVWY"; *s; ++s ) {
106             unsigned char c = *s;
107             symbol_type_table[c] |= fDNA_Ambig_Alphabet;
108             c = tolower(c);
109             symbol_type_table[c] |= fDNA_Ambig_Alphabet;
110         }
111         for ( const char* s = "ACDEFGHIKLMNPQRSTVWYBZX"; *s; ++s ) {
112             unsigned char c = *s;
113             symbol_type_table[c] |= fProtein_Alphabet;
114             c = tolower(c);
115             symbol_type_table[c] |= fProtein_Alphabet;
116         }
117         symbol_type_table[(unsigned char)'-']
118             |= fDNA_Main_Alphabet | fProtein_Alphabet;
119         symbol_type_table[(unsigned char)'*'] |= fProtein_Alphabet;
120         for ( const char* s = "\r\n"; *s; ++s ) {
121             unsigned char c = *s;
122             symbol_type_table[c] |= fLineEnd;
123         }
124         for ( int c = 1; c < 256; ++c ) {
125             if ( isalpha(c) )
126                 symbol_type_table[c] |= fAlpha;
127             if ( isdigit(c) )
128                 symbol_type_table[c] |= fDigit;
129             if ( isspace(c) )
130                 symbol_type_table[c] |= fSpace;
131         }
132         symbol_type_table[0] |= fInvalid;
133     }
134 }
135 
136 //  ----------------------------------------------------------------------------
137 int
138 CFormatGuess::s_CheckOrder[] =
139 //  ----------------------------------------------------------------------------
140 {
141     //  must list all EFormats except eUnknown and eFormat_max. Will cause
142     //  assertion if violated!
143     //
144     eBam, // must precede eGZip!
145     eZip,
146     eGZip,
147     eBZip2,
148     eLzo,
149     eSra,
150     eRmo,
151     eVcf,
152     eGtf,
153     eGvf,
154     eGff3,
155     eGff2,
156     eGlimmer3,
157     eAgp,
158     eXml,
159     eWiggle,
160     eBed,
161     eBed15,
162     eNewick,
163     eHgvs,
164     eAlignment,
165     eDistanceMatrix,
166     eFlatFileSequence,
167     eFiveColFeatureTable,
168     eSnpMarkers,
169     eFasta,
170     eTextASN,
171     eTaxplot,
172     ePhrapAce,
173     eTable,
174     eBinaryASN,
175 };
176 
177 
178 // This array must stay in sync with enum EFormat, but that's not
179 // supposed to change in the middle anyway, so the explicit size
180 // should suffice to avoid accidental skew.
181 const char* const CFormatGuess::sm_FormatNames[CFormatGuess::eFormat_max] = {
182     "unknown",
183     "binary ASN.1",
184     "RepeatMasker",
185     "GFF/GTF Poisoned",
186     "Glimmer3",
187     "AGP",
188     "XML",
189     "WIGGLE",
190     "BED",
191     "BED15",
192     "Newick",
193     "alignment",
194     "distance matrix",
195     "flat-file sequence",
196     "five-column feature table",
197     "SNP Markers",
198     "FASTA",
199     "text ASN.1",
200     "Taxplot",
201     "Phrap ACE",
202     "table",
203     "GTF",
204     "GFF3",
205     "GFF2",
206     "HGVS",
207     "GVF",
208     "zip",
209     "gzip",
210     "bzip2",
211     "lzo",
212     "SRA",
213     "BAM",
214     "VCF",
215 };
216 
217 const char*
218 CFormatGuess::GetFormatName(EFormat format)
219 {
220     unsigned int i = static_cast<unsigned int>(format);
221     if (i >= static_cast <unsigned int>(eFormat_max)) {
222         NCBI_THROW(CUtilException, eWrongData,
223                    "CFormatGuess::GetFormatName: out-of-range format value "
224                    + NStr::IntToString(i));
225     }
226     return sm_FormatNames[i];
227 }
228 
229 
230 //  ============================================================================
231 //  Old style class interface:
232 //  ============================================================================
233 
234 //  ----------------------------------------------------------------------------
235 CFormatGuess::ESequenceType
236 CFormatGuess::SequenceType(const char* str, unsigned length,
237                            ESTStrictness strictness)
238 {
239     if (length == 0)
240         length = (unsigned)::strlen(str);
241 
242     init_symbol_type_table();
243     unsigned int main_nuc_content = 0, ambig_content = 0, bad_nuc_content = 0,
244         amino_acid_content = 0, exotic_aa_content = 0, bad_aa_content = 0;
245 
246     for (unsigned i = 0; i < length; ++i) {
247         unsigned char c = str[i];
248         unsigned char type = symbol_type_table[c];
249         if ( type & fDNA_Main_Alphabet ) {
250             ++main_nuc_content;
251         } else if ( type & fDNA_Ambig_Alphabet ) {
252             ++ambig_content;
253         } else if ( !(type & (fSpace | fDigit)) ) {
254             ++bad_nuc_content;
255         }
256 
257         if ( type & fProtein_Alphabet ) {
258             ++amino_acid_content;
259         } else if ( type & fAlpha ) {
260             ++exotic_aa_content;
261         } else if ( !(type & (fSpace | fDigit)) ) {
262             ++bad_aa_content;
263         }
264     }
265 
266     switch (strictness) {
267     case eST_Lax:
268     {
269         double dna_content = (double)main_nuc_content / (double)length;
270         double prot_content = (double)amino_acid_content / (double)length;
271 
272         if (dna_content > 0.7) {
273             return eNucleotide;
274         }
275         if (prot_content > 0.7) {
276             return eProtein;
277         }
278     }
279 
280     case eST_Default:
281         if (bad_nuc_content + ambig_content <= main_nuc_content / 9
282             ||  (bad_nuc_content + ambig_content <= main_nuc_content / 3  &&
283                  bad_nuc_content <= (main_nuc_content + ambig_content) / 19)) {
284             // >=90% main alph. (ACGTUN-) or >=75% main and >=95% 4na-encodable
285             return eNucleotide;
286         } else if (bad_aa_content + exotic_aa_content
287                    <= amino_acid_content / 9) {
288             // >=90% relatively standard protein residues.  (JOU don't count.)
289             return eProtein;
290         }
291 
292     case eST_Strict: // Must be 100% encodable
293         if (bad_nuc_content == 0  &&  ambig_content <= main_nuc_content / 3) {
294             return eNucleotide;
295         } else if (bad_aa_content == 0
296                    &&  exotic_aa_content <= amino_acid_content / 9) {
297             return eProtein;
298         }
299     }
300 
301     return eUndefined;
302 }
303 
304 
305 //  ----------------------------------------------------------------------------
306 CFormatGuess::EFormat CFormatGuess::Format(const string& path, EOnError onerror)
307 {
308     CNcbiIfstream input(path.c_str(), IOS_BASE::in | IOS_BASE::binary);
309     return Format(input);
310 }
311 
312 //  ----------------------------------------------------------------------------
313 CFormatGuess::EFormat CFormatGuess::Format(CNcbiIstream& input, EOnError onerror)
314 {
315     CFormatGuess FG( input );
316     return FG.GuessFormat( onerror );
317 }
318 
319 
320 //  ============================================================================
321 //  New style object interface:
322 //  ============================================================================
323 
324 //  ----------------------------------------------------------------------------
325 CFormatGuess::CFormatGuess()
326     : m_Stream( * new CNcbiIfstream )
327     , m_bOwnsStream( true )
328 {
329     Initialize();
330 }
331 
332 //  ----------------------------------------------------------------------------
333 CFormatGuess::CFormatGuess(
334     const string& FileName )
335     : m_Stream( * new CNcbiIfstream( FileName.c_str() ) )
336     , m_bOwnsStream( true )
337 {
338     Initialize();
339 }
340 
341 //  ----------------------------------------------------------------------------
342 CFormatGuess::CFormatGuess(
343     CNcbiIstream& Stream )
344     : m_Stream( Stream )
345     , m_bOwnsStream( false )
346 {
347     Initialize();
348 }
349 
350 //  ----------------------------------------------------------------------------
351 CFormatGuess::~CFormatGuess()
352 {
353     delete[] m_pTestBuffer;
354     if ( m_bOwnsStream ) {
355         delete &m_Stream;
356     }
357 }
358 
359 //  ----------------------------------------------------------------------------
360 CFormatGuess::EFormat
361 CFormatGuess::GuessFormat( EMode )
362 {
363     return GuessFormat(eDefault);
364 }
365 
366 //  ----------------------------------------------------------------------------
367 CFormatGuess::EFormat
368 CFormatGuess::GuessFormat(
369     EOnError onerror )
370 {
371     if (!x_TestInput(m_Stream, onerror)) {
372         return eUnknown;
373     }
374     EMode mode = eQuick;
375     unsigned int uFormatCount = sizeof( s_CheckOrder ) / sizeof( int );
376 
377     // First, try to use hints
378     if ( !m_Hints.IsEmpty() ) {
379         for (unsigned int f = 0; f < uFormatCount; ++f) {
380             EFormat fmt = EFormat( s_CheckOrder[ f ] );
381             if (m_Hints.IsPreferred(fmt)  &&  x_TestFormat(fmt, mode)) {
382                 return fmt;
383             }
384         }
385     }
386 
387     // Check other formats, skip the ones that are disabled through hints
388     for (unsigned int f = 0; f < uFormatCount; ++f) {
389         EFormat fmt = EFormat( s_CheckOrder[ f ] );
390         if ( ! m_Hints.IsDisabled(fmt)  &&  x_TestFormat(fmt, mode) ) {
391             return fmt;
392         }
393     }
394     return eUnknown;
395 }
396 
397 //  ----------------------------------------------------------------------------
398 bool
399 CFormatGuess::TestFormat( EFormat format, EMode )
400 {
401     return TestFormat( format, eDefault);
402 }
403 
404 //  ----------------------------------------------------------------------------
405 bool
406 CFormatGuess::TestFormat(
407     EFormat format,
408     EOnError onerror )
409 {
410     if (format != eUnknown && !x_TestInput(m_Stream, onerror)) {
411         return false;
412     }
413     EMode mode = eQuick;
414     return x_TestFormat(format, mode);
415 }
416 
417 //  ----------------------------------------------------------------------------
418 bool CFormatGuess::x_TestFormat(EFormat format, EMode mode)
419 {
420     // First check if the format is disabled
421     if ( m_Hints.IsDisabled(format) ) {
422         return false;
423     }
424 
425     switch( format ) {
426 
427     case eBinaryASN:
428         return TestFormatBinaryAsn( mode );
429     case eRmo:
430         return TestFormatRepeatMasker( mode );
431     case eGtf:
432         return TestFormatGtf( mode );
433     case eGvf:
434                 return TestFormatGvf( mode );
435         case eGff3:
436         return TestFormatGff3( mode );
437     case eGff2:
438         return TestFormatGff2( mode );
439     case eGlimmer3:
440         return TestFormatGlimmer3( mode );
441     case eAgp:
442         return TestFormatAgp( mode );
443     case eXml:
444         return TestFormatXml( mode );
445     case eWiggle:
446         return TestFormatWiggle( mode );
447     case eBed:
448         return TestFormatBed( mode );
449     case eBed15:
450         return TestFormatBed15( mode );
451     case eNewick:
452         return TestFormatNewick( mode );
453     case eAlignment:
454         return TestFormatAlignment( mode );
455     case eDistanceMatrix:
456         return TestFormatDistanceMatrix( mode );
457     case eFlatFileSequence:
458         return TestFormatFlatFileSequence( mode );
459     case eFiveColFeatureTable:
460         return TestFormatFiveColFeatureTable( mode );
461     case eSnpMarkers:
462         return TestFormatSnpMarkers( mode );
463     case eFasta:
464         return TestFormatFasta( mode );
465     case eTextASN:
466         return TestFormatTextAsn( mode );
467     case eTaxplot:
468         return TestFormatTaxplot( mode );
469     case ePhrapAce:
470         return TestFormatPhrapAce( mode );
471     case eTable:
472         return TestFormatTable( mode );
473     case eHgvs:
474         return TestFormatHgvs( mode );
475     case eZip:
476         return TestFormatZip( mode );
477     case eGZip:
478         return TestFormatGZip( mode );
479     case eBZip2:
480         return TestFormatBZip2( mode );
481     case eLzo:
482         return TestFormatLzo( mode );
483     case eSra:
484         return TestFormatSra( mode );
485     case eBam:
486         return TestFormatBam( mode );
487     case eVcf:
488         return TestFormatVcf( mode );
489     default:
490         NCBI_THROW( CCoreException, eInvalidArg,
491             "CFormatGuess::x_TestFormat(): Unsupported format ID." );
492     }
493 }
494 
495 //  ----------------------------------------------------------------------------
496 void
497 CFormatGuess::Initialize()
498 {
499     NCBI_ASSERT(eFormat_max-2 == sizeof( s_CheckOrder ) / sizeof( int ),
500         "Indices in s_CheckOrder do not match format count ---"
501         "update s_CheckOrder to list all formats" 
502     );
503     NCBI_ASSERT(eFormat_max == sizeof(sm_FormatNames) / sizeof(const char*)
504                 &&  sm_FormatNames[eFormat_max - 1] != NULL,
505                 "sm_FormatNames does not list all possible formats");
506     m_pTestBuffer = 0;
507 
508     m_bStatsAreValid = false;
509     m_bSplitDone = false;
510     m_iStatsCountData = 0;
511     m_iStatsCountAlNumChars = 0;
512     m_iStatsCountDnaChars = 0;
513     m_iStatsCountAaChars = 0;
514 }
515 
516 //  ----------------------------------------------------------------------------
517 bool
518 CFormatGuess::EnsureTestBuffer()
519 {
520     if ( m_pTestBuffer ) {
521         return true;
522     }
523     if ( ! m_Stream.good() ) {
524         return false;
525     }
526 
527     // Fix to the all-comment problem.
528     // Read a test buffer,
529     // Test it for being all comment
530     // If its all comment, read a twice as long buffer
531     // Stop when its no longer all comment, end of the stream,
532     //   or Multiplier hits 1024 
533     int Multiplier = 1;
534     while(true) {
535         m_pTestBuffer = new char[ Multiplier * s_iTestBufferSize ];
536         m_Stream.read( m_pTestBuffer, Multiplier * s_iTestBufferSize );
537         m_iTestDataSize = m_Stream.gcount();
538         m_Stream.clear();  // in case we reached eof
539         CStreamUtils::Stepback( m_Stream, m_pTestBuffer, m_iTestDataSize );
540         
541         if (IsAllComment()) {
542             Multiplier *= 2;
543             delete [] m_pTestBuffer;
544             m_pTestBuffer = NULL;
545             if (Multiplier >= 1024 || m_iTestDataSize < ((Multiplier/2) * s_iTestBufferSize) )  {
546                 return false;
547             }
548             continue;
549         } else {
550             break;
551         }
552     }
553 
554     return true;
555 }
556 
557 //  ----------------------------------------------------------------------------
558 bool
559 CFormatGuess::EnsureStats()
560 {
561     if ( m_bStatsAreValid ) {
562         return true;
563     }
564     if ( ! EnsureTestBuffer() ) {
565         return false;
566     }
567     if ( m_iTestDataSize == 0 ) {
568         m_bStatsAreValid = true;
569         return true;
570     }
571 
572     CNcbiIstrstream TestBuffer(
573         reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
574     string strLine;
575 
576     init_symbol_type_table();
577     // Things we keep track of:
578     //   m_iStatsCountAlNumChars: number of characters that are letters or
579     //     digits
580     //   m_iStatsCountData: number of characters not part of a line starting
581     //     with '>', ignoring whitespace
582     //   m_iStatsCountDnaChars: number of characters counted in m_iStatsCountData
583     //     from the DNA alphabet
584     //   m_iStatsCountAaChars: number of characters counted in m_iStatsCountData
585     //     from the AA alphabet
586     //
587     while ( ! TestBuffer.fail() ) {
588         NcbiGetlineEOL( TestBuffer, strLine );
589 // code in CFormatGuess::Format counts line ends
590 // so, we will count them here as well
591         if (!strLine.empty()) {
592             strLine += '\n';
593         }
594         size_t size = strLine.size();
595         bool is_header = size > 0 && strLine[0] == '>';
596         for ( size_t i=0; i < size; ++i ) {
597             unsigned char c = strLine[i];
598             unsigned char type = symbol_type_table[c];
599 
600             if ( type & (fAlpha | fDigit | fSpace) ) {
601                 ++m_iStatsCountAlNumChars;
602             }
603             if ( !is_header ) {
604                 if ( !(type & fSpace) ) {
605                     ++m_iStatsCountData;
606                 }
607 
608                 if ( type & fDNA_Main_Alphabet ) {
609                     ++m_iStatsCountDnaChars;
610                 }
611                 if ( type & fProtein_Alphabet ) {
612                     ++m_iStatsCountAaChars;
613                 }
614                 if ( type & fLineEnd ) {
615                     ++m_iStatsCountAlNumChars;
616                     --m_iStatsCountData;
617                 }
618             }
619         }
620     }
621     m_bStatsAreValid = true;
622     return true;
623 }
624 
625 //  ----------------------------------------------------------------------------
626 bool CFormatGuess::x_TestInput( CNcbiIstream& input, EOnError onerror )
627 {
628     if (!input) {
629         if (onerror == eThrowOnBadSource) {
630             NCBI_THROW(CUtilException,eNoInput,"Unreadable input stream");
631         }
632         return false;
633     }
634     return true;
635 }
636 
637 //  ----------------------------------------------------------------------------
638 bool
639 CFormatGuess::TestFormatRepeatMasker(
640     EMode /* not used */ )
641 {
642     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
643         return false;
644     }
645     return IsInputRepeatMaskerWithHeader() ||
646         IsInputRepeatMaskerWithoutHeader();
647 }
648 
649 //  ----------------------------------------------------------------------------
650 bool
651 CFormatGuess::TestFormatPhrapAce(
652     EMode /* not used */ )
653 {
654     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
655         return false;
656     }
657 
658     ITERATE( list<string>, it, m_TestLines ) {
659         if ( IsLinePhrapId( *it ) ) {
660             return true;
661         }
662     }
663     return false;
664 }
665 
666 //  -----------------------------------------------------------------------------
667 bool
668 CFormatGuess::TestFormatGtf(
669     EMode /* not used */ )
670 {
671     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
672         return false;
673     }
674 
675     unsigned int uGtfLineCount = 0;
676     list<string>::iterator it = m_TestLines.begin();
677 
678     for ( ;  it != m_TestLines.end();  ++it) {
679         //
680         //  Make sure to ignore any UCSC track and browser lines prior to the
681         //  start of data
682         //
683         if ( it->empty() || (*it)[0] == '#' ) {
684             continue;
685         }
686         if ( !uGtfLineCount && NStr::StartsWith( *it, "browser " ) ) {
687             continue;
688         }
689         if ( !uGtfLineCount && NStr::StartsWith( *it, "track " ) ) {
690             continue;
691         }
692         if ( ! IsLineGtf( *it ) ) {
693             return false;
694         }
695         ++uGtfLineCount;
696     }
697     return (uGtfLineCount != 0);
698 }
699 
700 //  -----------------------------------------------------------------------------
701 bool
702 CFormatGuess::TestFormatGvf(
703     EMode /* not used */ )
704 {
705     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
706         return false;
707     }
708 
709     unsigned int uGvfLineCount = 0;
710     list<string>::iterator it = m_TestLines.begin();
711 
712     for ( ;  it != m_TestLines.end();  ++it) {
713         //
714         //  Make sure to ignore any UCSC track and browser lines prior to the
715         //  start of data
716         //
717         if ( it->empty() || (*it)[0] == '#' ) {
718                         continue;
719                 }
720                 if ( !uGvfLineCount && NStr::StartsWith( *it, "browser " ) ) {
721             continue;
722         }
723         if ( !uGvfLineCount && NStr::StartsWith( *it, "track " ) ) {
724             continue;
725         }
726         if ( ! IsLineGvf( *it ) ) {
727             return false;
728         }
729         ++uGvfLineCount;
730     }
731     return (uGvfLineCount != 0);
732 }
733 
734 
735 //  -----------------------------------------------------------------------------
736 bool
737 CFormatGuess::TestFormatGff3(
738     EMode /* not used */ )
739 {
740     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
741         return false;
742     }
743 
744     unsigned int uGffLineCount = 0;
745     list<string>::iterator it = m_TestLines.begin();
746 
747     for ( ;  it != m_TestLines.end();  ++it) {
748         //
749         //  Make sure to ignore any UCSC track and browser lines prior to the
750         //  start of data
751         //
752         if ( it->empty() || (*it)[0] == '#' ) {
753             continue;
754         }
755         if ( !uGffLineCount && NStr::StartsWith( *it, "browser " ) ) {
756             continue;
757         }
758         if ( !uGffLineCount && NStr::StartsWith( *it, "track " ) ) {
759             continue;
760         }
761         if ( ! IsLineGff3( *it ) ) {
762             return false;
763         }
764         ++uGffLineCount;
765     }
766     return (uGffLineCount != 0);
767 }
768 
769 
770 //  -----------------------------------------------------------------------------
771 bool
772 CFormatGuess::TestFormatGff2(
773     EMode /* not used */ )
774 {
775     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
776         return false;
777     }
778 
779     unsigned int uGffLineCount = 0;
780     list<string>::iterator it = m_TestLines.begin();
781 
782     for ( ;  it != m_TestLines.end();  ++it) {
783         //
784         //  Make sure to ignore any UCSC track and browser lines prior to the
785         //  start of data
786         //
787         if ( it->empty() || (*it)[0] == '#' ) {
788             continue;
789         }
790         if ( !uGffLineCount && NStr::StartsWith( *it, "browser " ) ) {
791             continue;
792         }
793         if ( !uGffLineCount && NStr::StartsWith( *it, "track " ) ) {
794             continue;
795         }
796         if ( ! IsLineGff2( *it ) ) {
797             return false;
798         }
799         ++uGffLineCount;
800     }
801     return (uGffLineCount != 0);
802 }
803 
804 
805 //  -----------------------------------------------------------------------------
806 bool
807 CFormatGuess::TestFormatGlimmer3(
808     EMode /* not used */ )
809 {
810     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
811         return false;
812     }
813 
814     /// first line should be a FASTA defline
815     list<string>::iterator it = m_TestLines.begin();
816     if (it->empty()  ||  (*it)[0] != '>') {
817         return false;
818     }
819     
820     /// there should be additional data lines, and they should be easily parseable, 
821     ///  with five columns
822     ++it;
823     if (it == m_TestLines.end()) {
824         return false;
825     }
826     for ( /**/;  it != m_TestLines.end();  ++it) {
827         if ( !IsLineGlimmer3( *it ) ) {
828             return false;
829         }
830     }
831     return true;
832 }
833 
834 //  -----------------------------------------------------------------------------
835 bool
836 CFormatGuess::TestFormatAgp(
837     EMode /* not used */ )
838 {
839     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
840         return false;
841     }
842     ITERATE( list<string>, it, m_TestLines ) {
843         if ( !IsLineAgp( *it ) ) {
844             return false;
845         }
846     }
847     return true;
848 }
849 
850 //  -----------------------------------------------------------------------------
851 bool
852 CFormatGuess::TestFormatNewick(
853     EMode /* not used */ )
854 {
855 //  -----------------------------------------------------------------------------
856     //  special newick consideration:
857     //  newick files may come with all data cramped into a single run-on line,
858     //  that single oversized line may not have a line terminator
859     const size_t maxSampleSize = 8*1024-1;
860     size_t sampleSize = 0;
861     char* pSample = new char[maxSampleSize+1];
862     AutoArray<char> autoDelete(pSample);
863 
864     m_Stream.read(pSample, maxSampleSize);
865     sampleSize = (size_t)m_Stream.gcount();
866     m_Stream.clear();  // in case we reached eof
867     CStreamUtils::Stepback(m_Stream, pSample, sampleSize);
868     if (0 == sampleSize) {
869         return false;
870     }
871 
872     pSample[sampleSize] = 0;
873     if (!IsSampleNewick(pSample)) { // tolerant of embedded line breaks
874         return false;
875     }
876     return true;
877 }
878 
879 //  -----------------------------------------------------------------------------
880 bool
881 CFormatGuess::TestFormatBinaryAsn(
882     EMode /* not used */ )
883 {
884     if ( ! EnsureTestBuffer() ) {
885         return false;
886     }
887 
888     //
889     //  Criterion: Presence of any non-printing characters
890     //
891     EConfidence conf = eNo;
892     for (int i = 0;  i < m_iTestDataSize;  ++i) {
893         if ( !isgraph((unsigned char) m_pTestBuffer[i])  &&
894              !isspace((unsigned char) m_pTestBuffer[i]) )
895         {
896             if (m_pTestBuffer[i] == '\1') {
897                 conf = eMaybe;
898             } else {
899                 return true;
900             }
901         }
902     }
903     return (conf == eYes);
904 }
905 
906 
907 //  -----------------------------------------------------------------------------
908 bool
909 CFormatGuess::TestFormatDistanceMatrix(
910     EMode /* not used */ )
911 {
912     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
913         return false;
914     }
915 
916     //
917     // criteria are odd:
918     //
919     list<string>::const_iterator iter = m_TestLines.begin();
920     list<string> toks;
921 
922     /// first line: one token, one number
923     NStr::Split(*iter++, "\t ", toks);
924     if (toks.size() != 1  ||
925         toks.front().find_first_not_of("0123456789") != string::npos) {
926         return false;
927     }
928 
929     // now, for remaining ones, we expect an alphanumeric item first,
930     // followed by a set of floating-point values.  Unless we are at the last
931     // line, the number of values should increase monotonically
932     for (size_t i = 1;  iter != m_TestLines.end();  ++i, ++iter) {
933         toks.clear();
934         NStr::Split(*iter, "\t ", toks);
935         if (toks.size() != i) {
936             /// we can ignore the last line ; it may be truncated
937             list<string>::const_iterator it = iter;
938             ++it;
939             if (it != m_TestLines.end()) {
940                 return false;
941             }
942         }
943 
944         list<string>::const_iterator it = toks.begin();
945         for (++it;  it != toks.end();  ++it) {
946             if ( ! s_IsTokenDouble( *it ) ) {
947                 return false;
948             }
949         }
950     }
951 
952     return true;
953 }
954 
955 //  -----------------------------------------------------------------------------
956 bool
957 CFormatGuess::TestFormatFlatFileSequence(
958     EMode /* not used */ )
959 {
960     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
961         return false;
962     }
963 
964     ITERATE (list<string>, it, m_TestLines) {
965         if ( !IsLineFlatFileSequence( *it ) ) {
966             return false;
967         }
968     }
969     return true;
970 }
971 
972 //  -----------------------------------------------------------------------------
973 bool
974 CFormatGuess::TestFormatFiveColFeatureTable(
975     EMode /* not used */ )
976 {
977     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
978         return false;
979     }
980 
981     ITERATE( list<string>, it, m_TestLines ) {
982         if (it->empty()) {
983             continue;
984         }
985 
986         if (it->find(">Feature ") != 0) {
987             return false;
988         }
989         if (it->find_first_of(" \t", 9) != string::npos) {
990             return false;
991         }
992         break;
993     }
994 
995     return true;
996 }
997 
998 //  -----------------------------------------------------------------------------
999 bool
1000 CFormatGuess::TestFormatXml(
1001     EMode /* not used */ )
1002 {
1003     if ( ! EnsureTestBuffer() ) {
1004         return false;
1005     }
1006 
1007     string input( m_pTestBuffer, (size_t)m_iTestDataSize );
1008     NStr::TruncateSpacesInPlace( input, NStr::eTrunc_Begin );
1009 
1010     //
1011     //  Test 1: If it starts with typical XML decorations such as "<?xml..."
1012     //  then respect that:
1013     //
1014     if ( NStr::StartsWith( input, "<?XML", NStr::eNocase ) ) {
1015         return true;
1016     }
1017     if ( NStr::StartsWith( input, "<!DOCTYPE", NStr::eNocase ) ) {
1018         return true;
1019     }
1020 
1021     //
1022     //  Test 2: In the absence of XML specific declarations, check whether the
1023     //  input starts with the opening tag of a well known set of doc types:
1024     //
1025     static const char* known_types[] = {
1026         "<Blast4-request>"
1027     };
1028     const int num_types = sizeof( known_types ) / sizeof( const char* );
1029 
1030     for ( int i=0; i < num_types; ++i ) {
1031         if ( NStr::StartsWith( input, known_types[i], NStr::eCase ) ) {
1032             return true;
1033         }
1034     }
1035 
1036     return false;
1037 }
1038 
1039 //  -----------------------------------------------------------------------------
1040 bool
1041 CFormatGuess::TestFormatAlignment(
1042     EMode /* not used */ )
1043 {
1044     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1045         return false;
1046     }
1047 
1048     // Alignment files come in all different shapes and broken formats,
1049     // and some of them are hard to recognize as such, in particular
1050     // if they have been hacked up in a text editor.
1051 
1052     // This functions only concerns itself with the ones that are
1053     // easy to recognize.
1054 
1055     // Note: We can live with false negatives. Avoid false positives
1056     // at all cost.
1057 
1058     ITERATE( list<string>, it, m_TestLines ) {
1059         if ( NPOS != it->find( "#NEXUS" ) ) {
1060             return true;
1061         }
1062         if ( NPOS != it->find( "CLUSTAL" ) ) {
1063             return true;
1064         }
1065     }
1066     return false;
1067 }
1068 
1069 //  -----------------------------------------------------------------------------
1070  bool 
1071  CFormatGuess::x_TestTableDelimiter(const string& delims)
1072  {
1073     list<string>::const_iterator iter = m_TestLines.begin();
1074     list<string> toks;
1075 
1076     // Merge delims if > 1.  Do not merge single delims (since they could 
1077     // more easily represent blank fields
1078     NStr::EMergeDelims  merge_delims = NStr::eMergeDelims;
1079     if (delims.size() == 1)
1080         merge_delims = NStr::eNoMergeDelims;
1081 
1082 
1083     // Skip initial lines since not all headers start with comments like # or ;:
1084     // Don't skip though if file is very short - add up to 3, 1 for each line 
1085     // over 5:
1086     for (size_t i=5; i<7; ++i)
1087         if (m_TestLines.size() > i) ++iter;
1088 
1089     /// determine the number of observed columns
1090     size_t ncols = 0;
1091     bool found = false;
1092     for ( ;  iter != m_TestLines.end()  &&  ! found;  ++iter) {
1093         if (iter->empty()  ||  (*iter)[0] == '#'  ||  (*iter)[0] == ';') {
1094             continue;
1095         }
1096 
1097         toks.clear();
1098         NStr::Split(*iter, delims, toks);
1099         ncols = toks.size();
1100         found = true;
1101     }
1102     if ( ncols < 2 ) {
1103         return false;
1104     }
1105 
1106     size_t nlines = 1;
1107     // verify that columns all have the same size
1108     // we can add an exception for the last line
1109     for ( ;  iter != m_TestLines.end();  ++iter) {
1110         if (iter->empty()  ||  (*iter)[0] == '#'  ||  (*iter)[0] == ';') {
1111             continue;
1112         } 
1113 
1114         toks.clear();
1115         NStr::Split(*iter, delims, toks);
1116         if (toks.size() != ncols) {
1117             list<string>::const_iterator it = iter;
1118             ++it;
1119             if (it != m_TestLines.end() || (m_iTestDataSize < s_iTestBufferSize) ) {
1120                 return false;
1121             }
1122         } else {
1123             ++nlines;
1124         }
1125     }
1126     return ( nlines >= 2 );
1127  }
1128 
1129 bool
1130 CFormatGuess::TestFormatTable(
1131     EMode /* not used */ )
1132 {
1133     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1134         return false;
1135     }
1136 
1137     //
1138     //  NOTE 1:
1139     //  There is a bunch of file formats that are a special type of table and
1140     //  that we want to identify (like Repeat Masker output). So not to shade
1141     //  out those more special formats, this test should be performed only after
1142     //  all the more specialized table formats have been tested.
1143     //
1144 
1145     //
1146     //  NOTE 2:
1147     //  The original criterion for this test was "the same number of observed
1148     //  columns in every line".
1149     //  In order to weed out false positives the following *additional*
1150     //  conditions have been imposed:
1151     //  - there are at least two observed columns
1152     //  - the sample contains at least two non-comment lines.
1153     //
1154 
1155     //' ' ' \t' '\t' ',' '|'
1156     if (x_TestTableDelimiter(" "))
1157         return true;
1158     else if (x_TestTableDelimiter(" \t"))
1159         return true;
1160     else if (x_TestTableDelimiter("\t"))
1161         return true;
1162     else if (x_TestTableDelimiter(","))
1163         return true;
1164     else if (x_TestTableDelimiter("|"))
1165         return true;
1166 
1167     return false;
1168 }
1169 
1170 //  -----------------------------------------------------------------------------
1171 bool
1172 CFormatGuess::TestFormatFasta(
1173     EMode /* not used */ )
1174 {
1175     if ( ! EnsureStats() ) {
1176         return false;
1177     }
1178 
1179     // reject obvious misfits:
1180     if ( m_iTestDataSize == 0 || m_pTestBuffer[0] != '>' ) {
1181         return false;
1182     }
1183     if ( m_iStatsCountData == 0 ) {
1184         if (0.75 > double(m_iStatsCountAlNumChars)/double(m_iTestDataSize) ) {
1185             return false;
1186         }
1187         return ( NStr::Find( m_pTestBuffer, "|" ) <= 10 );
1188     }
1189 
1190     // remaining decision based on text stats:
1191     double dAlNumFraction =  (double)m_iStatsCountAlNumChars / m_iTestDataSize;
1192     double dDnaFraction = (double)m_iStatsCountDnaChars / m_iStatsCountData;
1193     double dAaFraction = (double)m_iStatsCountAaChars / m_iStatsCountData;
1194 
1195     // want at least 80% text-ish overall:
1196     if ( dAlNumFraction < 0.8 ) {
1197         return false;
1198     }
1199 
1200     // want more than 91 percent of either DNA content or AA content in what we
1201     // presume is data:
1202     if ( dDnaFraction > 0.91 || dAaFraction > 0.91 ) {
1203         return true;
1204     }
1205     return false;
1206 }
1207 
1208 //  ----------------------------------------------------------------------------
1209 bool
1210 CFormatGuess::TestFormatTextAsn(
1211     EMode /* not used */ )
1212 {
1213     if ( ! EnsureStats() ) {
1214         return false;
1215     }
1216 
1217     // reject obvious misfits:
1218     if ( m_iTestDataSize == 0 || m_pTestBuffer[0] == '>' ) {
1219         return false;
1220     }
1221 
1222     // criteria:
1223     // at least 80% text-ish,
1224     // "::=" as the 2nd field of the first non-blank non comment line.
1225     //
1226     double dAlNumFraction =  (double)m_iStatsCountAlNumChars / m_iTestDataSize;
1227     if ( dAlNumFraction < 0.80 ) {
1228         return false;
1229     }
1230 
1231     CNcbiIstrstream TestBuffer(
1232         reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
1233     string strLine;
1234 
1235     while ( ! TestBuffer.fail() ) {
1236         vector<string> Fields;
1237         NcbiGetline( TestBuffer, strLine, "\n\r" );
1238         NStr::Tokenize( strLine, " \t", Fields, NStr::eMergeDelims );
1239         if ( IsAsnComment( Fields  ) ) {
1240             continue;
1241         }
1242         return ( Fields.size() >= 2 && Fields[1] == "::=" );
1243     }
1244     return false;
1245 }
1246 
1247 //  -----------------------------------------------------------------------------
1248 bool
1249 CFormatGuess::TestFormatTaxplot(
1250     EMode /* not used */ )
1251 {
1252     return false;
1253 }
1254 
1255 //  -----------------------------------------------------------------------------
1256 bool
1257 CFormatGuess::TestFormatSnpMarkers(
1258     EMode /* not used */ )
1259 {
1260     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1261         return false;
1262     }
1263     ITERATE( list<string>, it, m_TestLines ) {
1264         string str = *it;
1265         int rsid, chr, pos, numMatched;
1266         numMatched = sscanf( it->c_str(), "rs%d\t%d\t%d", &rsid, &chr, &pos);
1267         if ( numMatched == 3) {
1268             return true;
1269         }
1270     }
1271     return false;  
1272 }
1273 
1274 
1275 //  ----------------------------------------------------------------------------
1276 bool
1277 CFormatGuess::TestFormatBed(
1278     EMode /* not used */ )
1279 {
1280     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1281         return false;
1282     }
1283 
1284     bool bTrackLineFound( false );    
1285         bool bHasStartAndStop ( false );
1286     size_t columncount = 0;
1287     ITERATE( list<string>, it, m_TestLines ) {
1288         string str = NStr::TruncateSpaces( *it );
1289         if ( str.empty() ) {
1290             continue;
1291         }
1292                 
1293                 // 'chr 8' fixup, the bedreader does this too
1294                 if (str.find("chr ") == 0 || 
1295                         str.find("Chr ") == 0 || 
1296                         str.find("CHR ") == 0)
1297                         str.erase(3, 1);
1298 
1299         //
1300         //  while occurrence of the following decorations _is_ a good sign, they could
1301         //  also be indicator for a variety of other UCSC data formats
1302         //
1303         if ( NStr::StartsWith( str, "track" ) ) {
1304             bTrackLineFound = true;
1305             continue;
1306         }
1307         if ( NStr::StartsWith( str, "browser" ) ) {
1308             continue;
1309         }
1310         if ( NStr::StartsWith( str, "#" ) ) {
1311             continue;
1312         }
1313 
1314         vector<string> columns;
1315         NStr::Tokenize( str, " \t", columns, NStr::eMergeDelims );
1316         if (columns.size() < 3 || columns.size() > 12) {
1317             return false;
1318         }
1319         if ( columns.size() != columncount ) {
1320             if ( columncount == 0 ) {
1321                 columncount = columns.size();
1322             }
1323             else {
1324                 return false;
1325             }
1326         }
1327                 if(columns.size() >= 3) {
1328                         if (s_IsTokenPosInt(columns[1]) &&
1329                 s_IsTokenPosInt(columns[2])) {
1330                                 bHasStartAndStop = true;
1331                         }
1332                 }
1333     }
1334 
1335     return (bHasStartAndStop || bTrackLineFound);
1336 }
1337 
1338 //  ----------------------------------------------------------------------------
1339 bool
1340 CFormatGuess::TestFormatBed15(
1341     EMode /* not used */ )
1342 {
1343     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1344         return false;
1345     }
1346 
1347     bool LineFound = false;
1348     size_t columncount = 15;
1349     ITERATE( list<string>, it, m_TestLines ) {
1350         if ( NStr::TruncateSpaces( *it ).empty() ) {
1351             continue;
1352         }
1353         //
1354         //  while occurrence of the following decorations _is_ a good sign, they could
1355         //  also be indicator for a variety of other UCSC data formats
1356         //
1357         if ( NStr::StartsWith( *it, "track" ) ) {
1358             continue;
1359         }
1360         if ( NStr::StartsWith( *it, "browser" ) ) {
1361             continue;
1362         }
1363         if ( NStr::StartsWith( *it, "#" ) ) {
1364             continue;
1365         }
1366 
1367         vector<string> columns;
1368         NStr::Tokenize( *it, " \t", columns, NStr::eMergeDelims );
1369         if ( columns.size() != columncount ) {
1370             return false;
1371         } else {
1372             if (!s_IsTokenPosInt(columns[1]) ||   //chr start
1373                 !s_IsTokenPosInt(columns[2]) ||   //chr end
1374                 !s_IsTokenPosInt(columns[4]) ||   //score
1375                 !s_IsTokenPosInt(columns[6]) ||   //thick draw start
1376                 !s_IsTokenPosInt(columns[7]))     //thick draw end
1377                     return false;
1378             string strand = NStr::TruncateSpaces(columns[5]);
1379             
1380             if (strand != "+" && strand != "-")
1381                 return false;
1382 
1383             LineFound = true;
1384         }
1385     }
1386     return LineFound;
1387 }
1388 
1389 //  ----------------------------------------------------------------------------
1390 bool
1391 CFormatGuess::TestFormatWiggle(
1392     EMode /* not used */ )
1393 {
1394     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1395         return false;
1396     }
1397     ITERATE( list<string>, it, m_TestLines ) {
1398         if ( NStr::StartsWith( *it, "track" ) ) {
1399             if ( NStr::Find( *it, "type=wiggle_0" ) != NPOS ) {
1400                 return true;
1401             }
1402             if ( NStr::Find( *it, "type=bedGraph" ) != NPOS ) {
1403                 return true;
1404             }
1405         }
1406         if ( NStr::StartsWith(*it, "fixedStep") ) { /* MSS-140 */
1407             if ( NStr::Find(*it, "chrom=")  &&  NStr::Find(*it, "start=") ) {
1408                 return true;
1409             } 
1410         }
1411         if ( NStr::StartsWith(*it, "variableStep") ) { /* MSS-140 */
1412             if ( NStr::Find(*it, "chrom=") ) {
1413                 return true;
1414             }
1415             return true;
1416         }
1417     }
1418     return false;
1419 }
1420 
1421 //  ----------------------------------------------------------------------------
1422 bool
1423 CFormatGuess::TestFormatHgvs(
1424     EMode /* not used */ )
1425 {
1426     if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1427         return false;
1428     }
1429 
1430     unsigned int uHgvsLineCount = 0;
1431     list<string>::iterator it = m_TestLines.begin();
1432 
1433     for ( ;  it != m_TestLines.end();  ++it) {
1434         if ( it->empty() || (*it)[0] == '#' ) {
1435             continue;
1436         }
1437         if ( ! IsLineHgvs( *it ) ) {
1438             return false;
1439         }
1440         ++uHgvsLineCount;
1441     }
1442     return (uHgvsLineCount != 0);
1443 }
1444 
1445 
1446 //  ----------------------------------------------------------------------------
1447 bool
1448 CFormatGuess::TestFormatZip(
1449     EMode /* not used */ )
1450 {
1451     if ( ! EnsureTestBuffer() ) {
1452         return false;
1453     }
1454 
1455     // check if the first two bytes match with the zip magic number: 0x504B,
1456     // or BK and the next two bytes match with any of 0x0102, 0x0304, 0x0506
1457     // and 0x0708.
1458     if ( m_iTestDataSize < 4) {
1459         return false;
1460     }
1461 
1462     if (m_pTestBuffer[0] == 'P'  &&  m_pTestBuffer[1] == 'K'  &&
1463         ((m_pTestBuffer[2] == (char)1  &&  m_pTestBuffer[3] == (char)2)  ||
1464          (m_pTestBuffer[2] == (char)3  &&  m_pTestBuffer[3] == (char)4)  ||
1465          (m_pTestBuffer[2] == (char)5  &&  m_pTestBuffer[3] == (char)6) ||
1466          (m_pTestBuffer[2] == (char)7  &&  m_pTestBuffer[3] == (char)8) ) ) {
1467         return true;
1468     }
1469 
1470     return false;
1471 }
1472 
1473 
1474 //  ----------------------------------------------------------------------------
1475 bool
1476 CFormatGuess::TestFormatGZip(
1477     EMode /* not used */ )
1478 {
1479     if ( ! EnsureTestBuffer() ) {
1480         return false;
1481     }
1482 
1483     // check if the first two bytes match the gzip magic number: 0x1F8B
1484     if ( m_iTestDataSize < 2) {
1485         return false;
1486     }
1487 
1488     if (m_pTestBuffer[0] == (char)31  &&  m_pTestBuffer[1] == (char)139) {
1489         return true;
1490     }
1491 
1492     return false;
1493 }
1494 
1495 
1496 //  ----------------------------------------------------------------------------
1497 bool
1498 CFormatGuess::TestFormatBZip2(
1499     EMode /* not used */ )
1500 {
1501     if ( ! EnsureTestBuffer() ) {
1502         return false;
1503     }
1504 
1505     // check if the first two bytes match with the bzip2 magic number: 0x425A,
1506     // or 'BZ' and the next two bytes match with 0x68(h) and 0x31-39(1-9)
1507     if ( m_iTestDataSize < 4) {
1508         return false;
1509     }
1510 
1511     if (m_pTestBuffer[0] == 'B'  &&  m_pTestBuffer[1] == 'Z'  &&
1512         m_pTestBuffer[2] == 'h'  &&  m_pTestBuffer[3] >= '1'  &&
1513         m_pTestBuffer[3] <= '9') {
1514         return true;
1515     }
1516 
1517     return false;
1518 }
1519 
1520 
1521 //  ----------------------------------------------------------------------------
1522 bool
1523 CFormatGuess::TestFormatLzo(
1524     EMode /* not used */ )
1525 {
1526     if ( ! EnsureTestBuffer() ) {
1527         return false;
1528     }
1529 
1530     if (m_iTestDataSize >= 3  &&  m_pTestBuffer[0] == 'L'  &&
1531         m_pTestBuffer[1] == 'Z'  &&  m_pTestBuffer[2] == 'O') {
1532         if (m_iTestDataSize == 3  ||
1533             (m_iTestDataSize > 3  &&  m_pTestBuffer[3] == '\0')) {
1534             return true;
1535         }
1536     }
1537 
1538     if (m_iTestDataSize >= 4  &&  m_pTestBuffer[1] == 'L'  &&
1539         m_pTestBuffer[2] == 'Z'  &&  m_pTestBuffer[3] == 'O') {
1540         if (m_iTestDataSize == 4  ||
1541             (m_iTestDataSize > 4  &&  m_pTestBuffer[4] == '\0')) {
1542             return true;
1543         }
1544     }
1545 
1546     return false;
1547 }
1548 
1549 
1550 bool CFormatGuess::TestFormatSra(EMode /* not used */ )
1551 {
1552     if ( !EnsureTestBuffer()  ||  m_iTestDataSize < 16
1553         ||  CTempString(m_pTestBuffer, 8) != "NCBI.sra") {
1554         return false;
1555     }
1556 
1557     if (m_pTestBuffer[8] == '\x05'  &&  m_pTestBuffer[9] == '\x03'
1558         &&  m_pTestBuffer[10] == '\x19'  &&  m_pTestBuffer[11] == '\x88') {
1559         return true;
1560     } else if (m_pTestBuffer[8] == '\x88'  &&  m_pTestBuffer[9] == '\x19'
1561         &&  m_pTestBuffer[10] == '\x03'  &&  m_pTestBuffer[11] == '\x05') {
1562         return true;
1563     } else {
1564         return false;
1565     }
1566 }
1567 
1568 bool CFormatGuess::TestFormatBam(EMode mode)
1569 {
1570     // Check for a gzip header whose first (only) extra field spans
1571     // at least six bytes and has the tag BC.
1572     return (TestFormatGZip(mode)  &&  m_iTestDataSize >= 18
1573             &&  (m_pTestBuffer[3] & 4) != 0 // extra field present
1574             &&  (static_cast<unsigned char>(m_pTestBuffer[10]) >= 6
1575                  ||  m_pTestBuffer[11] != 0) // at least six bytes
1576             &&  m_pTestBuffer[12] == 'B'  &&  m_pTestBuffer[13] == 'C');
1577 }
1578 
1579 //  ----------------------------------------------------------------------------
1580 bool CFormatGuess::TestFormatVcf(
1581     EMode)
1582 //  ----------------------------------------------------------------------------
1583 {
1584     // Currently, only look for the header line identifying the VCF version.
1585     // Waive requirement this be the first line, but still expect it to by
1586     // in the initial sample.
1587     if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1588         return false;
1589     }
1590 
1591     ITERATE( list<string>, it, m_TestLines ) {
1592         if (NStr::StartsWith(*it, "##fileformat=VCFv")) {
1593             return true;
1594         }
1595     }
1596     return false;
1597 }
1598 
1599 //  ----------------------------------------------------------------------------
1600 bool CFormatGuess::IsInputRepeatMaskerWithHeader()
1601 {
1602     //
1603     //  Repeatmasker files consist of columnar data with a couple of lines
1604     //  of column labels prepended to it (but sometimes someone strips those
1605     //  labels).
1606     //  This function tries to identify repeatmasker data by those column
1607     //  label lines. They should be the first non-blanks in the file.
1608     //
1609     string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
1610     string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
1611 
1612     //
1613     //  Purge junk lines:
1614     //
1615     list<string>::iterator it = m_TestLines.begin();
1616     for  ( ; it != m_TestLines.end(); ++it ) {
1617         NStr::TruncateSpacesInPlace( *it );
1618         if ( *it != "" ) {
1619             break;
1620         }
1621     }
1622 
1623     if ( it == m_TestLines.end() ) {
1624         return false;
1625     }
1626 
1627     //
1628     //  Verify first line of labels:
1629     //
1630     size_t current_offset = 0;
1631     for ( size_t i=0; labels_1st_line[i] != ""; ++i ) {
1632         current_offset = NStr::FindCase( *it, labels_1st_line[i], current_offset );
1633         if ( current_offset == NPOS ) {
1634             return false;
1635         }
1636     }
1637 
1638     //
1639     //  Verify second line of labels:
1640     //
1641     ++it;
1642     if ( it == m_TestLines.end() ) {
1643         return false;
1644     }
1645     current_offset = 0;
1646     for ( size_t j=0; labels_2nd_line[j] != ""; ++j ) {
1647         current_offset = NStr::FindCase( *it, labels_2nd_line[j], current_offset );
1648         if ( current_offset == NPOS ) {
1649             return false;
1650         }
1651     }
1652 
1653     //
1654     //  Should have at least one extra line:
1655     //
1656     ++it;
1657     if ( it == m_TestLines.end() ) {
1658         return false;
1659     }
1660 
1661     return true;
1662 }
1663 
1664 
1665 //  ----------------------------------------------------------------------------
1666 bool CFormatGuess::IsInputRepeatMaskerWithoutHeader()
1667 {
1668     //
1669     //  Repeatmasker files consist of columnar data with a couple of lines
1670     //  of column labels prepended to it (but sometimes someone strips those
1671     //  labels).
1672     //  This function assumes the column labels have been stripped and attempts
1673     //  to identify RMO by checking the data itself.
1674     //
1675 
1676     //
1677     //  We declare the data as RMO if we are able to parse every record in the
1678     //  sample we got:
1679     //
1680     ITERATE( list<string>, it, m_TestLines ) {
1681         string str = NStr::TruncateSpaces( *it );
1682         if ( str == "" ) {
1683             continue;
1684         }
1685         if ( ! IsLineRmo( str ) ) {
1686             return false;
1687         }
1688     }
1689 
1690     return true;
1691 }
1692 
1693 
1694 //  ----------------------------------------------------------------------------
1695 bool
1696 CFormatGuess::IsSampleNewick(
1697     const string& cline )
1698 //  ----------------------------------------------------------------------------
1699 {
1700     //  NOTE:
1701     //  See http://evolution.genetics.washington.edu/phylip/newick_doc.html
1702     //
1703     //  Note that Newick tree tend to be written out as a single long line. Thus,
1704     //  we are most likely only seeing the first part of a tree.
1705     //
1706 
1707     //  NOTE:
1708     //  MSS-112 introduced the concept of multitree files is which after the ";" 
1709     //  another tree may start. The new logic accepts files as Newick if they 
1710     //  are Newick up to and including the first semicolon. It does not look
1711     //  beyond.
1712 
1713     string line = NStr::TruncateSpaces( cline );
1714     if ( line.empty()  ||  line[0] != '(') {
1715         return false;
1716     }
1717     {{
1718         //  Strip out comments:
1719         string trimmed;
1720         bool in_comment = false;
1721         for ( size_t ii=0; line.c_str()[ii] != 0; ++ii ) {
1722             if ( ! in_comment ) {
1723                 if ( line.c_str()[ii] != '[' ) {
1724                     trimmed += line.c_str()[ii];
1725                 }
1726                 else {
1727                     in_comment = true;
1728                 }
1729             }
1730             else /* in_comment */ {
1731                 if ( line.c_str()[ii] == ']' ) {
1732                     in_comment = false;
1733                 }
1734             }
1735         }
1736         line = trimmed;
1737     }}
1738     {{
1739         //  Compress quoted labels:
1740         string trimmed;
1741         bool in_quote = false;
1742         for ( size_t ii=0; line.c_str()[ii] != 0; ++ii ) {
1743             if ( ! in_quote ) {
1744                 if ( line.c_str()[ii] != '\'' ) {
1745                     trimmed += line.c_str()[ii];
1746                 }
1747                 else {
1748                     in_quote = true;
1749                     trimmed += 'A';
1750                 }
1751             }
1752             else { /* in_quote */
1753                 if ( line.c_str()[ii] == '\'' ) {
1754                     in_quote = false;
1755                 }
1756             }
1757         }
1758         line = trimmed;
1759     }}
1760     {{
1761         //  Strip distance markers:
1762         string trimmed;
1763         size_t ii=0;
1764         while ( line.c_str()[ii] != 0 ) {
1765             if ( line.c_str()[ii] != ':' ) {
1766                 trimmed += line.c_str()[ii++];
1767             }
1768             else {
1769                 ii++;
1770                 if ( line.c_str()[ii] == '-'  || line.c_str()[ii] == '+' ) {
1771                     ii++;
1772                 }
1773                 while ( '0' <= line.c_str()[ii] && line.c_str()[ii] <= '9' ) {
1774                     ii++;
1775                 }
1776                 if ( line.c_str()[ii] == '.' ) {
1777                     ii++;
1778                     while ( '0' <= line.c_str()[ii] && line.c_str()[ii] <= '9' ) {
1779                         ii++;
1780                     }
1781                 }
1782             }
1783         }
1784         line = trimmed;
1785     }}
1786     {{
1787         //  Rough lexical analysis of what's left. Bail immediately on fault:
1788         if (line.empty()  ||  line[0] != '(') {
1789             return false;
1790         }
1791         size_t paren_count = 1;
1792         for ( size_t ii=1; line.c_str()[ii] != 0; ++ii ) {
1793             switch ( line.c_str()[ii] ) {
1794                 default: 
1795                     break;
1796                 case '(':
1797                     ++paren_count;
1798                     break;
1799                 case ')':
1800                     if ( paren_count == 0 ) {
1801                         return false;
1802                     }
1803                     --paren_count;
1804                     break;
1805                 case ',':
1806                     if ( paren_count == 0 ) {
1807                         return false;
1808                     }
1809                     break;
1810                 case ';':
1811 //                    if ( line[ii+1] != 0 ) {
1812 //                        return false;
1813 //                    }
1814                     break;
1815             }
1816         }
1817     }}
1818     return true; 
1819 }
1820 
1821 
1822 //  ----------------------------------------------------------------------------
1823 bool CFormatGuess::IsLineFlatFileSequence(
1824     const string& line )
1825 {
1826     // blocks of ten residues (or permitted punctuation characters)
1827     // with a count at the start or end; require at least four
1828     // (normally six)
1829     SIZE_TYPE pos = line.find_first_not_of("0123456789 \t");
1830     if (pos == NPOS  ||  pos + 45 >= line.size()) {
1831         return false;
1832     }
1833 
1834     for (SIZE_TYPE i = 0;  i < 45;  ++i) {
1835         char c = line[pos + i];
1836         if (i % 11 == 10) {
1837             if ( !isspace(c) ) {
1838                 return false;
1839             }
1840         } else {
1841             if ( !isalpha(c)  &&  c != '-'  &&  c != '*') {
1842                 return false;
1843             }
1844         }
1845     }
1846 
1847     return true;
1848 }
1849 
1850 
1851 //  ----------------------------------------------------------------------------
1852 bool CFormatGuess::IsLabelNewick(
1853     const string& label )
1854 {
1855     //  Starts with a string of anything other than "[]:", optionally followed by
1856     //  a single ':', followed by a number, optionally followed by a dot and
1857     //  another number.
1858     if ( NPOS != label.find_first_of( "[]" ) ) {
1859         return false;
1860     }
1861     size_t colon = label.find( ':' );
1862     if ( NPOS == colon ) {
1863         return true;
1864     }
1865     size_t dot = label.find_first_not_of( "0123456789", colon + 1 );
1866     if ( NPOS == dot ) {
1867         return true;
1868     }
1869     if ( label[ dot ] != '.' ) {
1870         return false;
1871     }
1872     size_t end = label.find_first_not_of( "0123456789", dot + 1 );
1873     return ( NPOS == end );
1874 }
1875 
1876 
1877 //  ----------------------------------------------------------------------------
1878 bool CFormatGuess::IsLineAgp( 
1879     const string& strLine )
1880 {
1881     //
1882     //  Note: The reader allows for line and endline comments starting with a '#'.
1883     //  So we accept them here, too.
1884     //
1885     string line( strLine );
1886     size_t uCommentStart = NStr::Find( line, "#" );
1887 
1888     if ( NPOS != uCommentStart ) {
1889         line = line.substr( 0, uCommentStart );
1890     }
1891     NStr::TruncateSpacesInPlace( line );
1892     if ( line.empty() ) {
1893         return true;
1894     }
1895 
1896     vector<string> tokens;
1897     if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1898         return false;
1899     }
1900 
1901     if ( tokens[1].size() > 1 && tokens[1][0] == '-' ) {
1902         tokens[1][0] = '1';
1903     }
1904     if ( -1 == NStr::StringToNonNegativeInt( tokens[1] ) ) {
1905         return false;
1906     }
1907 
1908     if ( tokens[2].size() > 1 && tokens[2][0] == '-' ) {
1909         tokens[2][0] = '1';
1910     }
1911     if ( -1 == NStr::StringToNonNegativeInt( tokens[2] ) ) {
1912         return false;
1913     }
1914 
1915     if ( tokens[3].size() > 1 && tokens[3][0] == '-' ) {
1916         tokens[3][0] = '1';
1917     }
1918     if ( -1 == NStr::StringToNonNegativeInt( tokens[3] ) ) {
1919         return false;
1920     }
1921 
1922     if ( tokens[4].size() != 1 || NPOS == tokens[4].find_first_of( "ADFGPNOW" ) ) {
1923         return false;
1924     }
1925     if ( tokens[4] == "N" ) {
1926         if ( -1 == NStr::StringToNonNegativeInt( tokens[5] ) ) {
1927             return false;
1928         }
1929     }
1930     else {
1931         if ( -1 == NStr::StringToNonNegativeInt( tokens[6] ) ) {
1932             return false;
1933         }
1934         if ( -1 == NStr::StringToNonNegativeInt( tokens[7] ) ) {
1935             return false;
1936         }            
1937         if ( tokens.size() != 9 ) {
1938             return false;
1939         }
1940         if ( tokens[8].size() != 1 || NPOS == tokens[8].find_first_of( "+-" ) ) {
1941             return false;
1942         }
1943     }
1944 
1945     return true;
1946 }
1947 
1948 
1949 //  ----------------------------------------------------------------------------
1950 bool CFormatGuess::IsLineGlimmer3(
1951     const string& line )
1952 {
1953     list<string> toks;
1954     NStr::Split(line, "\t ", toks);
1955     if (toks.size() != 5) {
1956         return false;
1957     }
1958 
1959     list<string>::iterator i = toks.begin();
1960 
1961     /// first column: skip (ascii identifier)
1962     ++i;
1963 
1964     /// second, third columns: both ints
1965     if ( ! s_IsTokenInteger( *i++ ) ) {
1966         return false;
1967     }
1968     if ( ! s_IsTokenInteger( *i++ ) ) {
1969         return false;
1970     }
1971 
1972     /// fourth column: int in the range of -3...3
1973     if ( ! s_IsTokenInteger( *i ) ) {
1974         return false;
1975     }
1976     int frame = NStr::StringToInt( *i++ );
1977     if (frame < -3  ||  frame > 3) {
1978         return false;
1979     }
1980 
1981     /// fifth column: score; double
1982     if ( ! s_IsTokenDouble( *i ) ) {
1983         return false;
1984     }
1985 
1986     return true;
1987 }
1988 
1989 
1990 //  ----------------------------------------------------------------------------
1991 bool CFormatGuess::IsLineGtf(
1992     const string& line )
1993 {
1994     vector<string> tokens;
1995     if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1996         return false;
1997     }
1998     if ( ! s_IsTokenPosInt( tokens[3] ) ) {
1999         return false;
2000     }
2001     if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2002         return false;
2003     }
2004     if ( ! s_IsTokenDouble( tokens[5] ) ) {
2005         return false;
2006     }
2007     if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2008         return false;
2009     }
2010     if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2011         return false;
2012     }
2013     if ( tokens.size() < 9 || 
2014          (NPOS == tokens[8].find( "gene_id" ) && NPOS == tokens[8].find( "transcript_id" ) ) ) {
2015         return false;
2016     }
2017     return true;
2018 }
2019 
2020 
2021 //  ----------------------------------------------------------------------------
2022 bool CFormatGuess::IsLineGvf(
2023     const string& line )
2024 {
2025     vector<string> tokens;
2026     if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
2027         return false;
2028     }
2029     if ( ! s_IsTokenPosInt( tokens[3] ) ) {
2030         return false;
2031     }
2032     if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2033         return false;
2034     }
2035         {{
2036                 list<string> terms;
2037                 terms.push_back("snv");
2038                 terms.push_back("cnv");
2039                 terms.push_back("copy_number_variation");
2040                 terms.push_back("gain");
2041                 terms.push_back("copy_number_gain");
2042                 terms.push_back("loss");
2043                 terms.push_back("copy_number_loss");
2044                 terms.push_back("loss_of_heterozygosity");
2045                 terms.push_back("complex");
2046                 terms.push_back("complex_substitution");
2047                 terms.push_back("complex_sequence_alteration");
2048                 terms.push_back("indel");
2049                 terms.push_back("insertion");
2050                 terms.push_back("inversion");
2051                 terms.push_back("substitution");
2052                 terms.push_back("deletion");
2053                 terms.push_back("duplication");
2054                 terms.push_back("translocation");
2055                 terms.push_back("upd");
2056                 terms.push_back("uniparental_disomy");
2057                 terms.push_back("maternal_uniparental_disomy");
2058                 terms.push_back("paternal_uniparental_disomy");
2059                 terms.push_back("tandom_duplication");
2060                 terms.push_back("structural_variation");
2061                 terms.push_back("sequence_alteration");
2062                 ITERATE(list<string>, termiter, terms) {
2063                         if(NStr::EqualNocase(*termiter, tokens[2]))
2064                                 return true;
2065                 }
2066         }}
2067         if ( ! s_IsTokenDouble( tokens[5] ) ) {
2068         return false;
2069     }
2070     if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2071         return false;
2072     }
2073     if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2074         return false;
2075     }
2076         if(tokens.size() >= 9) {
2077                 list<string> terms;
2078                 terms.push_back("start_range");
2079                 terms.push_back("end_range");
2080                 terms.push_back("variant_seq");
2081                 terms.push_back("genotype");
2082                 ITERATE(list<string>, termiter, terms) {
2083                         if(NStr::EqualNocase(*termiter, tokens[8]))
2084                                 return true;
2085                 }
2086         }
2087 
2088     return false;
2089 }
2090 
2091 
2092 //  ----------------------------------------------------------------------------
2093 bool CFormatGuess::IsLineGff3(
2094     const string& line )
2095 {
2096     vector<string> tokens;
2097     if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
2098         return false;
2099     }
2100     if ( ! s_IsTokenPosInt( tokens[3] ) ) {
2101         return false;
2102     }
2103     if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2104         return false;
2105     }
2106     if ( ! s_IsTokenDouble( tokens[5] ) ) {
2107         return false;
2108     }
2109     if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2110         return false;
2111     }
2112     if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2113         return false;
2114     }
2115     if ( tokens.size() < 9 || tokens[8].empty()) {
2116         return false;
2117     }
2118     if ( tokens.size() >= 9 && tokens[8].size() > 1) {
2119         const string& col9 = tokens[8];
2120         if ( NPOS == NStr::FindNoCase(col9, "ID") &&
2121              NPOS == NStr::FindNoCase(col9, "Parent") &&
2122              NPOS == NStr::FindNoCase(col9, "Target") &&
2123              NPOS == NStr::FindNoCase(col9, "Name") &&
2124              NPOS == NStr::FindNoCase(col9, "Alias") &&
2125              NPOS == NStr::FindNoCase(col9, "Note") &&
2126              NPOS == NStr::FindNoCase(col9, "Dbxref") &&
2127              NPOS == NStr::FindNoCase(col9, "Xref") ) {
2128             return false;
2129         }
2130     }
2131 
2132     return true;
2133 }
2134 
2135 
2136 //  ----------------------------------------------------------------------------
2137 bool CFormatGuess::IsLineGff2(
2138     const string& line )
2139 {
2140     vector<string> tokens;
2141     if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
2142         return false;
2143     }
2144     if ( ! s_IsTokenPosInt( tokens[3] ) ) {
2145         return false;
2146     }
2147     if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2148         return false;
2149     }
2150     if ( ! s_IsTokenDouble( tokens[5] ) ) {
2151         return false;
2152     }
2153     if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2154         return false;
2155     }
2156     if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2157         return false;
2158     }
2159     return true;
2160 }
2161 
2162 
2163 //  ----------------------------------------------------------------------------
2164 bool CFormatGuess::IsLinePhrapId(
2165     const string& line )
2166 {
2167     vector<string> values;
2168     if ( NStr::Tokenize( line, " \t", values, NStr::eMergeDelims ).empty() ) {
2169         return false;
2170     }
2171 
2172     //
2173     //  Old style: "^DNA \\w+ "
2174     //
2175     if ( values[0] == "DNA" ) {
2176         return true;
2177     }
2178 
2179     //
2180     //  New style: "^AS [0-9]+ [0-9]+"
2181     //
2182     if ( values[0] == "AS" ) {
2183         return ( 0 <= NStr::StringToNonNegativeInt( values[1] ) &&
2184           0 <= NStr::StringToNonNegativeInt( values[2] ) );
2185     }
2186 
2187     return false;
2188 }
2189 
2190 
2191 //  ----------------------------------------------------------------------------
2192 bool CFormatGuess::IsLineRmo(
2193     const string& line )
2194 {
2195     const size_t MIN_VALUES_PER_RECORD = 14;
2196 
2197     //
2198     //  Make sure there is enough stuff on that line:
2199     //
2200     list<string> values;
2201     if ( NStr::Split( line, " \t", values ).size() < MIN_VALUES_PER_RECORD ) {
2202         return false;
2203     }
2204 
2205     //
2206     //  Look at specific values and make sure they are of the correct type:
2207     //
2208 
2209     //  1: positive integer:
2210     list<string>::iterator it = values.begin();
2211     if ( ! s_IsTokenPosInt( *it ) ) {
2212         return false;
2213     }
2214 
2215     //  2: float:
2216     ++it;
2217     if ( ! s_IsTokenDouble( *it ) ) {
2218         return false;
2219     }
2220 
2221     //  3: float:
2222     ++it;
2223     if ( ! s_IsTokenDouble( *it ) ) {
2224         return false;
2225     }
2226 
2227     //  4: float:
2228     ++it;
2229     if ( ! s_IsTokenDouble( *it ) ) {
2230         return false;
2231     }
2232 
2233     //  5: string, not checked
2234     ++it;
2235 
2236     //  6: positive integer:
2237     ++it;
2238     if ( ! s_IsTokenPosInt( *it ) ) {
2239         return false;
2240     }
2241 
2242     //  7: positive integer:
2243     ++it;
2244     if ( ! s_IsTokenPosInt( *it ) ) {
2245         return false;
2246     }
2247 
2248     //  8: positive integer, likely in paretheses, not checked:
2249     ++it;
2250 
2251     //  9: '+' or 'C':
2252     ++it;
2253     if ( *it != "+" && *it != "C" ) {
2254         return false;
2255     }
2256 
2257     //  and that's enough for now. But there are at least two more fields 
2258     //  with values that look testable.
2259 
2260     return true;
2261 }
2262 
2263 
2264 //  ----------------------------------------------------------------------------
2265 bool
2266 CFormatGuess::IsAsnComment(
2267     const vector<string>& Fields )
2268 {
2269     if ( Fields.size() == 0 ) {
2270         return true;
2271     }
2272     return ( NStr::StartsWith( Fields[0], "--" ) );
2273 }
2274 
2275 //  ----------------------------------------------------------------------------
2276 bool
2277 CFormatGuess::EnsureSplitLines()
2278 //  ----------------------------------------------------------------------------
2279 {
2280     if ( m_bSplitDone ) {
2281         return !m_TestLines.empty();
2282     }
2283     m_bSplitDone = true;
2284 
2285     //
2286     //  Make sure the given data is ASCII before checking potential line breaks:
2287     //
2288     const size_t MIN_HIGH_RATIO = 20;
2289     size_t high_count = 0;
2290     for ( streamsize i=0; i < m_iTestDataSize; ++i ) {
2291         if ( 0x80 & m_pTestBuffer[i] ) {
2292             ++high_count;
2293         }
2294     }
2295     if ( 0 < high_count && m_iTestDataSize / high_count < MIN_HIGH_RATIO ) {
2296         return false;
2297     }
2298 
2299     //
2300     //  Let's expect at least one line break in the given data:
2301     //
2302     string data( m_pTestBuffer, (size_t)m_iTestDataSize );
2303     m_TestLines.clear();
2304 
2305     if ( string::npos != data.find( "\r\n" ) ) {
2306         NStr::Split( data, "\r\n", m_TestLines );
2307     }
2308     else if ( string::npos != data.find( "\n" ) ) {
2309         NStr::Split( data, "\n", m_TestLines );
2310     }
2311     else if ( string::npos != data.find( "\r" ) ) {
2312         NStr::Split( data, "\r", m_TestLines );
2313     }
2314     else {
2315         //single truncated line
2316         return false;
2317     }
2318 
2319     if ( m_iTestDataSize == s_iTestBufferSize   &&  m_TestLines.size() > 1 ) {
2320         m_TestLines.pop_back();
2321     }
2322     return !m_TestLines.empty();
2323 }
2324 
2325 //  ----------------------------------------------------------------------------
2326 bool
2327 CFormatGuess::IsAllComment()
2328 {
2329     // first stab - are we text?  comments are only valid if we are text
2330     size_t count = 0;
2331     size_t count_print = 0;
2332     for (int i = 0;  i < m_iTestDataSize;  ++i, ++count) {
2333         if (isprint((unsigned char) m_pTestBuffer[i])) {
2334             ++count_print;
2335         }
2336     }
2337     if (count_print < count * 0.9) {
2338         // 10% non-printing at least; likely not text
2339                 return false;
2340     }
2341 
2342     m_bSplitDone = false;
2343     m_TestLines.clear();
2344     EnsureSplitLines();
2345 
2346     ITERATE(list<string>, it, m_TestLines) {
2347         if(it->empty()) {
2348             continue;
2349         }
2350         else if(NStr::StartsWith(*it, "#")) {
2351             continue;
2352         }
2353         else if(NStr::StartsWith(*it, "--")) {
2354             continue;
2355         }
2356         else {
2357             return false;
2358         }
2359     }
2360     
2361     return true;
2362 }
2363 
2364 //  ----------------------------------------------------------------------------
2365 bool CFormatGuess::IsLineHgvs(
2366     const string& line )
2367 {
2368     // This simple check can mistake Newwick, so Newwick is checked first
2369     //  /:(g|c|r|p|m|mt|n)\./  as in NC_000001.9:g.1234567C>T
2370     int State = 0;
2371     ITERATE(string, Iter, line) {
2372         char Char = *Iter;
2373         char Next = '\0';
2374         string::const_iterator NextI = Iter;
2375         ++NextI;
2376         if(NextI != line.end())
2377             Next = *NextI;
2378         
2379         if(State == 0) {
2380             if(Char == ':')
2381                 State = 1;
2382         } else if(State == 1) {
2383             if (Char == 'g' ||
2384                 Char == 'c' ||
2385                 Char == 'r' ||
2386                 Char == 'p' ||
2387                 Char == 'n' ||
2388                 Char == 'm' ) {
2389                 State = 2;
2390                 if (Char=='m' && Next == 't') {
2391                     ++Iter;
2392                 }
2393             }
2394         } else if(State == 2) {
2395             if(Char == '.') 
2396                 State = 3;
2397         }
2398     }
2399     
2400     return (State == 3);    
2401 }
2402 
2403 
2404 
2405 END_NCBI_SCOPE
2406 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.