|
NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/src/util/format_guess.cpp |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: format_guess.cpp 57368 2013-03-01 14:47:09Z falkrb $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Anatoliy Kuznetsov
27 *
28 * File Description: Implemented methods to identify file formats.
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <util/format_guess.hpp>
34 #include <util/util_exception.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/stream_utils.hpp>
38
39 BEGIN_NCBI_SCOPE
40
/// Character-class bit flags stored per byte value in symbol_type_table.
/// A byte may carry any combination of these flags.
enum ESymbolType {
    fDNA_Main_Alphabet  = 0x01, ///< Just ACGTUN-.
    fDNA_Ambig_Alphabet = 0x02, ///< Anything else representable in ncbi4na.
    fProtein_Alphabet   = 0x04, ///< Allows BZX*-, but not JOU.
    fLineEnd            = 0x08, ///< '\r' or '\n'.
    fAlpha              = 0x10, ///< isalpha() is true.
    fDigit              = 0x20, ///< isdigit() is true.
    fSpace              = 0x40, ///< isspace() is true.
    fInvalid            = 0x80  ///< Set on the NUL byte once the table is built.
};
51
/// Three-level answer used by tests that cannot always decide outright.
enum EConfidence {
    eNo    = 0,
    eMaybe = 1,
    eYes   = 2
};
57
58
59 // ============================================================================
60 // Helper routine--- file scope only:
61 // ============================================================================
62
/// ESymbolType flags for every possible byte value; all zero until
/// init_symbol_type_table() has filled it in.
static unsigned char symbol_type_table[256] = { 0 };
64
65 // ----------------------------------------------------------------------------
66 static bool s_IsTokenPosInt(
67 const string& strToken )
68 {
69 return ( -1 != NStr::StringToNonNegativeInt( strToken ) );
70 }
71
72 // ----------------------------------------------------------------------------
73 static bool s_IsTokenInteger(
74 const string& strToken )
75 // ----------------------------------------------------------------------------
76 {
77 if ( ! strToken.empty() && strToken[0] == '-' ) {
78 return s_IsTokenPosInt( strToken.substr( 1 ) );
79 }
80 return s_IsTokenPosInt( strToken );
81 }
82
83 // ----------------------------------------------------------------------------
84 static bool s_IsTokenDouble(
85 const string& strToken )
86 {
87 string token( strToken );
88 NStr::ReplaceInPlace( token, ".", "1", 0, 1 );
89 if ( token.size() > 1 && token[0] == '-' ) {
90 token[0] = '1';
91 }
92 return s_IsTokenPosInt( token );
93 }
94
95 // ----------------------------------------------------------------------------
96 static void init_symbol_type_table(void)
97 {
98 if ( symbol_type_table[0] == 0 ) {
99 for ( const char* s = "ACGNTU"; *s; ++s ) {
100 unsigned char c = *s;
101 symbol_type_table[c] |= fDNA_Main_Alphabet;
102 c = tolower(c);
103 symbol_type_table[c] |= fDNA_Main_Alphabet;
104 }
105 for ( const char* s = "BDHKMRSVWY"; *s; ++s ) {
106 unsigned char c = *s;
107 symbol_type_table[c] |= fDNA_Ambig_Alphabet;
108 c = tolower(c);
109 symbol_type_table[c] |= fDNA_Ambig_Alphabet;
110 }
111 for ( const char* s = "ACDEFGHIKLMNPQRSTVWYBZX"; *s; ++s ) {
112 unsigned char c = *s;
113 symbol_type_table[c] |= fProtein_Alphabet;
114 c = tolower(c);
115 symbol_type_table[c] |= fProtein_Alphabet;
116 }
117 symbol_type_table[(unsigned char)'-']
118 |= fDNA_Main_Alphabet | fProtein_Alphabet;
119 symbol_type_table[(unsigned char)'*'] |= fProtein_Alphabet;
120 for ( const char* s = "\r\n"; *s; ++s ) {
121 unsigned char c = *s;
122 symbol_type_table[c] |= fLineEnd;
123 }
124 for ( int c = 1; c < 256; ++c ) {
125 if ( isalpha(c) )
126 symbol_type_table[c] |= fAlpha;
127 if ( isdigit(c) )
128 symbol_type_table[c] |= fDigit;
129 if ( isspace(c) )
130 symbol_type_table[c] |= fSpace;
131 }
132 symbol_type_table[0] |= fInvalid;
133 }
134 }
135
136 // ----------------------------------------------------------------------------
137 int
138 CFormatGuess::s_CheckOrder[] =
139 // ----------------------------------------------------------------------------
140 {
141 // must list all EFormats except eUnknown and eFormat_max. Will cause
142 // assertion if violated!
143 //
144 eBam, // must precede eGZip!
145 eZip,
146 eGZip,
147 eBZip2,
148 eLzo,
149 eSra,
150 eRmo,
151 eVcf,
152 eGtf,
153 eGvf,
154 eGff3,
155 eGff2,
156 eGlimmer3,
157 eAgp,
158 eXml,
159 eWiggle,
160 eBed,
161 eBed15,
162 eNewick,
163 eHgvs,
164 eAlignment,
165 eDistanceMatrix,
166 eFlatFileSequence,
167 eFiveColFeatureTable,
168 eSnpMarkers,
169 eFasta,
170 eTextASN,
171 eTaxplot,
172 ePhrapAce,
173 eTable,
174 eBinaryASN,
175 };
176
177
178 // This array must stay in sync with enum EFormat, but that's not
179 // supposed to change in the middle anyway, so the explicit size
180 // should suffice to avoid accidental skew.
181 const char* const CFormatGuess::sm_FormatNames[CFormatGuess::eFormat_max] = {
182 "unknown",
183 "binary ASN.1",
184 "RepeatMasker",
185 "GFF/GTF Poisoned",
186 "Glimmer3",
187 "AGP",
188 "XML",
189 "WIGGLE",
190 "BED",
191 "BED15",
192 "Newick",
193 "alignment",
194 "distance matrix",
195 "flat-file sequence",
196 "five-column feature table",
197 "SNP Markers",
198 "FASTA",
199 "text ASN.1",
200 "Taxplot",
201 "Phrap ACE",
202 "table",
203 "GTF",
204 "GFF3",
205 "GFF2",
206 "HGVS",
207 "GVF",
208 "zip",
209 "gzip",
210 "bzip2",
211 "lzo",
212 "SRA",
213 "BAM",
214 "VCF",
215 };
216
217 const char*
218 CFormatGuess::GetFormatName(EFormat format)
219 {
220 unsigned int i = static_cast<unsigned int>(format);
221 if (i >= static_cast <unsigned int>(eFormat_max)) {
222 NCBI_THROW(CUtilException, eWrongData,
223 "CFormatGuess::GetFormatName: out-of-range format value "
224 + NStr::IntToString(i));
225 }
226 return sm_FormatNames[i];
227 }
228
229
230 // ============================================================================
231 // Old style class interface:
232 // ============================================================================
233
234 // ----------------------------------------------------------------------------
235 CFormatGuess::ESequenceType
236 CFormatGuess::SequenceType(const char* str, unsigned length,
237 ESTStrictness strictness)
238 {
239 if (length == 0)
240 length = (unsigned)::strlen(str);
241
242 init_symbol_type_table();
243 unsigned int main_nuc_content = 0, ambig_content = 0, bad_nuc_content = 0,
244 amino_acid_content = 0, exotic_aa_content = 0, bad_aa_content = 0;
245
246 for (unsigned i = 0; i < length; ++i) {
247 unsigned char c = str[i];
248 unsigned char type = symbol_type_table[c];
249 if ( type & fDNA_Main_Alphabet ) {
250 ++main_nuc_content;
251 } else if ( type & fDNA_Ambig_Alphabet ) {
252 ++ambig_content;
253 } else if ( !(type & (fSpace | fDigit)) ) {
254 ++bad_nuc_content;
255 }
256
257 if ( type & fProtein_Alphabet ) {
258 ++amino_acid_content;
259 } else if ( type & fAlpha ) {
260 ++exotic_aa_content;
261 } else if ( !(type & (fSpace | fDigit)) ) {
262 ++bad_aa_content;
263 }
264 }
265
266 switch (strictness) {
267 case eST_Lax:
268 {
269 double dna_content = (double)main_nuc_content / (double)length;
270 double prot_content = (double)amino_acid_content / (double)length;
271
272 if (dna_content > 0.7) {
273 return eNucleotide;
274 }
275 if (prot_content > 0.7) {
276 return eProtein;
277 }
278 }
279
280 case eST_Default:
281 if (bad_nuc_content + ambig_content <= main_nuc_content / 9
282 || (bad_nuc_content + ambig_content <= main_nuc_content / 3 &&
283 bad_nuc_content <= (main_nuc_content + ambig_content) / 19)) {
284 // >=90% main alph. (ACGTUN-) or >=75% main and >=95% 4na-encodable
285 return eNucleotide;
286 } else if (bad_aa_content + exotic_aa_content
287 <= amino_acid_content / 9) {
288 // >=90% relatively standard protein residues. (JOU don't count.)
289 return eProtein;
290 }
291
292 case eST_Strict: // Must be 100% encodable
293 if (bad_nuc_content == 0 && ambig_content <= main_nuc_content / 3) {
294 return eNucleotide;
295 } else if (bad_aa_content == 0
296 && exotic_aa_content <= amino_acid_content / 9) {
297 return eProtein;
298 }
299 }
300
301 return eUndefined;
302 }
303
304
305 // ----------------------------------------------------------------------------
306 CFormatGuess::EFormat CFormatGuess::Format(const string& path, EOnError onerror)
307 {
308 CNcbiIfstream input(path.c_str(), IOS_BASE::in | IOS_BASE::binary);
309 return Format(input);
310 }
311
312 // ----------------------------------------------------------------------------
313 CFormatGuess::EFormat CFormatGuess::Format(CNcbiIstream& input, EOnError onerror)
314 {
315 CFormatGuess FG( input );
316 return FG.GuessFormat( onerror );
317 }
318
319
320 // ============================================================================
321 // New style object interface:
322 // ============================================================================
323
324 // ----------------------------------------------------------------------------
325 CFormatGuess::CFormatGuess()
326 : m_Stream( * new CNcbiIfstream )
327 , m_bOwnsStream( true )
328 {
329 Initialize();
330 }
331
332 // ----------------------------------------------------------------------------
333 CFormatGuess::CFormatGuess(
334 const string& FileName )
335 : m_Stream( * new CNcbiIfstream( FileName.c_str() ) )
336 , m_bOwnsStream( true )
337 {
338 Initialize();
339 }
340
341 // ----------------------------------------------------------------------------
342 CFormatGuess::CFormatGuess(
343 CNcbiIstream& Stream )
344 : m_Stream( Stream )
345 , m_bOwnsStream( false )
346 {
347 Initialize();
348 }
349
350 // ----------------------------------------------------------------------------
351 CFormatGuess::~CFormatGuess()
352 {
353 delete[] m_pTestBuffer;
354 if ( m_bOwnsStream ) {
355 delete &m_Stream;
356 }
357 }
358
359 // ----------------------------------------------------------------------------
360 CFormatGuess::EFormat
361 CFormatGuess::GuessFormat( EMode )
362 {
363 return GuessFormat(eDefault);
364 }
365
366 // ----------------------------------------------------------------------------
367 CFormatGuess::EFormat
368 CFormatGuess::GuessFormat(
369 EOnError onerror )
370 {
371 if (!x_TestInput(m_Stream, onerror)) {
372 return eUnknown;
373 }
374 EMode mode = eQuick;
375 unsigned int uFormatCount = sizeof( s_CheckOrder ) / sizeof( int );
376
377 // First, try to use hints
378 if ( !m_Hints.IsEmpty() ) {
379 for (unsigned int f = 0; f < uFormatCount; ++f) {
380 EFormat fmt = EFormat( s_CheckOrder[ f ] );
381 if (m_Hints.IsPreferred(fmt) && x_TestFormat(fmt, mode)) {
382 return fmt;
383 }
384 }
385 }
386
387 // Check other formats, skip the ones that are disabled through hints
388 for (unsigned int f = 0; f < uFormatCount; ++f) {
389 EFormat fmt = EFormat( s_CheckOrder[ f ] );
390 if ( ! m_Hints.IsDisabled(fmt) && x_TestFormat(fmt, mode) ) {
391 return fmt;
392 }
393 }
394 return eUnknown;
395 }
396
397 // ----------------------------------------------------------------------------
398 bool
399 CFormatGuess::TestFormat( EFormat format, EMode )
400 {
401 return TestFormat( format, eDefault);
402 }
403
404 // ----------------------------------------------------------------------------
405 bool
406 CFormatGuess::TestFormat(
407 EFormat format,
408 EOnError onerror )
409 {
410 if (format != eUnknown && !x_TestInput(m_Stream, onerror)) {
411 return false;
412 }
413 EMode mode = eQuick;
414 return x_TestFormat(format, mode);
415 }
416
417 // ----------------------------------------------------------------------------
418 bool CFormatGuess::x_TestFormat(EFormat format, EMode mode)
419 {
420 // First check if the format is disabled
421 if ( m_Hints.IsDisabled(format) ) {
422 return false;
423 }
424
425 switch( format ) {
426
427 case eBinaryASN:
428 return TestFormatBinaryAsn( mode );
429 case eRmo:
430 return TestFormatRepeatMasker( mode );
431 case eGtf:
432 return TestFormatGtf( mode );
433 case eGvf:
434 return TestFormatGvf( mode );
435 case eGff3:
436 return TestFormatGff3( mode );
437 case eGff2:
438 return TestFormatGff2( mode );
439 case eGlimmer3:
440 return TestFormatGlimmer3( mode );
441 case eAgp:
442 return TestFormatAgp( mode );
443 case eXml:
444 return TestFormatXml( mode );
445 case eWiggle:
446 return TestFormatWiggle( mode );
447 case eBed:
448 return TestFormatBed( mode );
449 case eBed15:
450 return TestFormatBed15( mode );
451 case eNewick:
452 return TestFormatNewick( mode );
453 case eAlignment:
454 return TestFormatAlignment( mode );
455 case eDistanceMatrix:
456 return TestFormatDistanceMatrix( mode );
457 case eFlatFileSequence:
458 return TestFormatFlatFileSequence( mode );
459 case eFiveColFeatureTable:
460 return TestFormatFiveColFeatureTable( mode );
461 case eSnpMarkers:
462 return TestFormatSnpMarkers( mode );
463 case eFasta:
464 return TestFormatFasta( mode );
465 case eTextASN:
466 return TestFormatTextAsn( mode );
467 case eTaxplot:
468 return TestFormatTaxplot( mode );
469 case ePhrapAce:
470 return TestFormatPhrapAce( mode );
471 case eTable:
472 return TestFormatTable( mode );
473 case eHgvs:
474 return TestFormatHgvs( mode );
475 case eZip:
476 return TestFormatZip( mode );
477 case eGZip:
478 return TestFormatGZip( mode );
479 case eBZip2:
480 return TestFormatBZip2( mode );
481 case eLzo:
482 return TestFormatLzo( mode );
483 case eSra:
484 return TestFormatSra( mode );
485 case eBam:
486 return TestFormatBam( mode );
487 case eVcf:
488 return TestFormatVcf( mode );
489 default:
490 NCBI_THROW( CCoreException, eInvalidArg,
491 "CFormatGuess::x_TestFormat(): Unsupported format ID." );
492 }
493 }
494
495 // ----------------------------------------------------------------------------
496 void
497 CFormatGuess::Initialize()
498 {
499 NCBI_ASSERT(eFormat_max-2 == sizeof( s_CheckOrder ) / sizeof( int ),
500 "Indices in s_CheckOrder do not match format count ---"
501 "update s_CheckOrder to list all formats"
502 );
503 NCBI_ASSERT(eFormat_max == sizeof(sm_FormatNames) / sizeof(const char*)
504 && sm_FormatNames[eFormat_max - 1] != NULL,
505 "sm_FormatNames does not list all possible formats");
506 m_pTestBuffer = 0;
507
508 m_bStatsAreValid = false;
509 m_bSplitDone = false;
510 m_iStatsCountData = 0;
511 m_iStatsCountAlNumChars = 0;
512 m_iStatsCountDnaChars = 0;
513 m_iStatsCountAaChars = 0;
514 }
515
516 // ----------------------------------------------------------------------------
517 bool
518 CFormatGuess::EnsureTestBuffer()
519 {
520 if ( m_pTestBuffer ) {
521 return true;
522 }
523 if ( ! m_Stream.good() ) {
524 return false;
525 }
526
527 // Fix to the all-comment problem.
528 // Read a test buffer,
529 // Test it for being all comment
530 // If its all comment, read a twice as long buffer
531 // Stop when its no longer all comment, end of the stream,
532 // or Multiplier hits 1024
533 int Multiplier = 1;
534 while(true) {
535 m_pTestBuffer = new char[ Multiplier * s_iTestBufferSize ];
536 m_Stream.read( m_pTestBuffer, Multiplier * s_iTestBufferSize );
537 m_iTestDataSize = m_Stream.gcount();
538 m_Stream.clear(); // in case we reached eof
539 CStreamUtils::Stepback( m_Stream, m_pTestBuffer, m_iTestDataSize );
540
541 if (IsAllComment()) {
542 Multiplier *= 2;
543 delete [] m_pTestBuffer;
544 m_pTestBuffer = NULL;
545 if (Multiplier >= 1024 || m_iTestDataSize < ((Multiplier/2) * s_iTestBufferSize) ) {
546 return false;
547 }
548 continue;
549 } else {
550 break;
551 }
552 }
553
554 return true;
555 }
556
557 // ----------------------------------------------------------------------------
558 bool
559 CFormatGuess::EnsureStats()
560 {
561 if ( m_bStatsAreValid ) {
562 return true;
563 }
564 if ( ! EnsureTestBuffer() ) {
565 return false;
566 }
567 if ( m_iTestDataSize == 0 ) {
568 m_bStatsAreValid = true;
569 return true;
570 }
571
572 CNcbiIstrstream TestBuffer(
573 reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
574 string strLine;
575
576 init_symbol_type_table();
577 // Things we keep track of:
578 // m_iStatsCountAlNumChars: number of characters that are letters or
579 // digits
580 // m_iStatsCountData: number of characters not part of a line starting
581 // with '>', ignoring whitespace
582 // m_iStatsCountDnaChars: number of characters counted in m_iStatsCountData
583 // from the DNA alphabet
584 // m_iStatsCountAaChars: number of characters counted in m_iStatsCountData
585 // from the AA alphabet
586 //
587 while ( ! TestBuffer.fail() ) {
588 NcbiGetlineEOL( TestBuffer, strLine );
589 // code in CFormatGuess::Format counts line ends
590 // so, we will count them here as well
591 if (!strLine.empty()) {
592 strLine += '\n';
593 }
594 size_t size = strLine.size();
595 bool is_header = size > 0 && strLine[0] == '>';
596 for ( size_t i=0; i < size; ++i ) {
597 unsigned char c = strLine[i];
598 unsigned char type = symbol_type_table[c];
599
600 if ( type & (fAlpha | fDigit | fSpace) ) {
601 ++m_iStatsCountAlNumChars;
602 }
603 if ( !is_header ) {
604 if ( !(type & fSpace) ) {
605 ++m_iStatsCountData;
606 }
607
608 if ( type & fDNA_Main_Alphabet ) {
609 ++m_iStatsCountDnaChars;
610 }
611 if ( type & fProtein_Alphabet ) {
612 ++m_iStatsCountAaChars;
613 }
614 if ( type & fLineEnd ) {
615 ++m_iStatsCountAlNumChars;
616 --m_iStatsCountData;
617 }
618 }
619 }
620 }
621 m_bStatsAreValid = true;
622 return true;
623 }
624
625 // ----------------------------------------------------------------------------
626 bool CFormatGuess::x_TestInput( CNcbiIstream& input, EOnError onerror )
627 {
628 if (!input) {
629 if (onerror == eThrowOnBadSource) {
630 NCBI_THROW(CUtilException,eNoInput,"Unreadable input stream");
631 }
632 return false;
633 }
634 return true;
635 }
636
637 // ----------------------------------------------------------------------------
638 bool
639 CFormatGuess::TestFormatRepeatMasker(
640 EMode /* not used */ )
641 {
642 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
643 return false;
644 }
645 return IsInputRepeatMaskerWithHeader() ||
646 IsInputRepeatMaskerWithoutHeader();
647 }
648
649 // ----------------------------------------------------------------------------
650 bool
651 CFormatGuess::TestFormatPhrapAce(
652 EMode /* not used */ )
653 {
654 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
655 return false;
656 }
657
658 ITERATE( list<string>, it, m_TestLines ) {
659 if ( IsLinePhrapId( *it ) ) {
660 return true;
661 }
662 }
663 return false;
664 }
665
666 // -----------------------------------------------------------------------------
667 bool
668 CFormatGuess::TestFormatGtf(
669 EMode /* not used */ )
670 {
671 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
672 return false;
673 }
674
675 unsigned int uGtfLineCount = 0;
676 list<string>::iterator it = m_TestLines.begin();
677
678 for ( ; it != m_TestLines.end(); ++it) {
679 //
680 // Make sure to ignore any UCSC track and browser lines prior to the
681 // start of data
682 //
683 if ( it->empty() || (*it)[0] == '#' ) {
684 continue;
685 }
686 if ( !uGtfLineCount && NStr::StartsWith( *it, "browser " ) ) {
687 continue;
688 }
689 if ( !uGtfLineCount && NStr::StartsWith( *it, "track " ) ) {
690 continue;
691 }
692 if ( ! IsLineGtf( *it ) ) {
693 return false;
694 }
695 ++uGtfLineCount;
696 }
697 return (uGtfLineCount != 0);
698 }
699
700 // -----------------------------------------------------------------------------
701 bool
702 CFormatGuess::TestFormatGvf(
703 EMode /* not used */ )
704 {
705 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
706 return false;
707 }
708
709 unsigned int uGvfLineCount = 0;
710 list<string>::iterator it = m_TestLines.begin();
711
712 for ( ; it != m_TestLines.end(); ++it) {
713 //
714 // Make sure to ignore any UCSC track and browser lines prior to the
715 // start of data
716 //
717 if ( it->empty() || (*it)[0] == '#' ) {
718 continue;
719 }
720 if ( !uGvfLineCount && NStr::StartsWith( *it, "browser " ) ) {
721 continue;
722 }
723 if ( !uGvfLineCount && NStr::StartsWith( *it, "track " ) ) {
724 continue;
725 }
726 if ( ! IsLineGvf( *it ) ) {
727 return false;
728 }
729 ++uGvfLineCount;
730 }
731 return (uGvfLineCount != 0);
732 }
733
734
735 // -----------------------------------------------------------------------------
736 bool
737 CFormatGuess::TestFormatGff3(
738 EMode /* not used */ )
739 {
740 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
741 return false;
742 }
743
744 unsigned int uGffLineCount = 0;
745 list<string>::iterator it = m_TestLines.begin();
746
747 for ( ; it != m_TestLines.end(); ++it) {
748 //
749 // Make sure to ignore any UCSC track and browser lines prior to the
750 // start of data
751 //
752 if ( it->empty() || (*it)[0] == '#' ) {
753 continue;
754 }
755 if ( !uGffLineCount && NStr::StartsWith( *it, "browser " ) ) {
756 continue;
757 }
758 if ( !uGffLineCount && NStr::StartsWith( *it, "track " ) ) {
759 continue;
760 }
761 if ( ! IsLineGff3( *it ) ) {
762 return false;
763 }
764 ++uGffLineCount;
765 }
766 return (uGffLineCount != 0);
767 }
768
769
770 // -----------------------------------------------------------------------------
771 bool
772 CFormatGuess::TestFormatGff2(
773 EMode /* not used */ )
774 {
775 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
776 return false;
777 }
778
779 unsigned int uGffLineCount = 0;
780 list<string>::iterator it = m_TestLines.begin();
781
782 for ( ; it != m_TestLines.end(); ++it) {
783 //
784 // Make sure to ignore any UCSC track and browser lines prior to the
785 // start of data
786 //
787 if ( it->empty() || (*it)[0] == '#' ) {
788 continue;
789 }
790 if ( !uGffLineCount && NStr::StartsWith( *it, "browser " ) ) {
791 continue;
792 }
793 if ( !uGffLineCount && NStr::StartsWith( *it, "track " ) ) {
794 continue;
795 }
796 if ( ! IsLineGff2( *it ) ) {
797 return false;
798 }
799 ++uGffLineCount;
800 }
801 return (uGffLineCount != 0);
802 }
803
804
805 // -----------------------------------------------------------------------------
806 bool
807 CFormatGuess::TestFormatGlimmer3(
808 EMode /* not used */ )
809 {
810 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
811 return false;
812 }
813
814 /// first line should be a FASTA defline
815 list<string>::iterator it = m_TestLines.begin();
816 if (it->empty() || (*it)[0] != '>') {
817 return false;
818 }
819
820 /// there should be additional data lines, and they should be easily parseable,
821 /// with five columns
822 ++it;
823 if (it == m_TestLines.end()) {
824 return false;
825 }
826 for ( /**/; it != m_TestLines.end(); ++it) {
827 if ( !IsLineGlimmer3( *it ) ) {
828 return false;
829 }
830 }
831 return true;
832 }
833
834 // -----------------------------------------------------------------------------
835 bool
836 CFormatGuess::TestFormatAgp(
837 EMode /* not used */ )
838 {
839 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
840 return false;
841 }
842 ITERATE( list<string>, it, m_TestLines ) {
843 if ( !IsLineAgp( *it ) ) {
844 return false;
845 }
846 }
847 return true;
848 }
849
850 // -----------------------------------------------------------------------------
851 bool
852 CFormatGuess::TestFormatNewick(
853 EMode /* not used */ )
854 {
855 // -----------------------------------------------------------------------------
856 // special newick consideration:
857 // newick files may come with all data cramped into a single run-on line,
858 // that single oversized line may not have a line terminator
859 const size_t maxSampleSize = 8*1024-1;
860 size_t sampleSize = 0;
861 char* pSample = new char[maxSampleSize+1];
862 AutoArray<char> autoDelete(pSample);
863
864 m_Stream.read(pSample, maxSampleSize);
865 sampleSize = (size_t)m_Stream.gcount();
866 m_Stream.clear(); // in case we reached eof
867 CStreamUtils::Stepback(m_Stream, pSample, sampleSize);
868 if (0 == sampleSize) {
869 return false;
870 }
871
872 pSample[sampleSize] = 0;
873 if (!IsSampleNewick(pSample)) { // tolerant of embedded line breaks
874 return false;
875 }
876 return true;
877 }
878
879 // -----------------------------------------------------------------------------
880 bool
881 CFormatGuess::TestFormatBinaryAsn(
882 EMode /* not used */ )
883 {
884 if ( ! EnsureTestBuffer() ) {
885 return false;
886 }
887
888 //
889 // Criterion: Presence of any non-printing characters
890 //
891 EConfidence conf = eNo;
892 for (int i = 0; i < m_iTestDataSize; ++i) {
893 if ( !isgraph((unsigned char) m_pTestBuffer[i]) &&
894 !isspace((unsigned char) m_pTestBuffer[i]) )
895 {
896 if (m_pTestBuffer[i] == '\1') {
897 conf = eMaybe;
898 } else {
899 return true;
900 }
901 }
902 }
903 return (conf == eYes);
904 }
905
906
907 // -----------------------------------------------------------------------------
908 bool
909 CFormatGuess::TestFormatDistanceMatrix(
910 EMode /* not used */ )
911 {
912 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
913 return false;
914 }
915
916 //
917 // criteria are odd:
918 //
919 list<string>::const_iterator iter = m_TestLines.begin();
920 list<string> toks;
921
922 /// first line: one token, one number
923 NStr::Split(*iter++, "\t ", toks);
924 if (toks.size() != 1 ||
925 toks.front().find_first_not_of("0123456789") != string::npos) {
926 return false;
927 }
928
929 // now, for remaining ones, we expect an alphanumeric item first,
930 // followed by a set of floating-point values. Unless we are at the last
931 // line, the number of values should increase monotonically
932 for (size_t i = 1; iter != m_TestLines.end(); ++i, ++iter) {
933 toks.clear();
934 NStr::Split(*iter, "\t ", toks);
935 if (toks.size() != i) {
936 /// we can ignore the last line ; it may be truncated
937 list<string>::const_iterator it = iter;
938 ++it;
939 if (it != m_TestLines.end()) {
940 return false;
941 }
942 }
943
944 list<string>::const_iterator it = toks.begin();
945 for (++it; it != toks.end(); ++it) {
946 if ( ! s_IsTokenDouble( *it ) ) {
947 return false;
948 }
949 }
950 }
951
952 return true;
953 }
954
955 // -----------------------------------------------------------------------------
956 bool
957 CFormatGuess::TestFormatFlatFileSequence(
958 EMode /* not used */ )
959 {
960 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
961 return false;
962 }
963
964 ITERATE (list<string>, it, m_TestLines) {
965 if ( !IsLineFlatFileSequence( *it ) ) {
966 return false;
967 }
968 }
969 return true;
970 }
971
972 // -----------------------------------------------------------------------------
973 bool
974 CFormatGuess::TestFormatFiveColFeatureTable(
975 EMode /* not used */ )
976 {
977 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
978 return false;
979 }
980
981 ITERATE( list<string>, it, m_TestLines ) {
982 if (it->empty()) {
983 continue;
984 }
985
986 if (it->find(">Feature ") != 0) {
987 return false;
988 }
989 if (it->find_first_of(" \t", 9) != string::npos) {
990 return false;
991 }
992 break;
993 }
994
995 return true;
996 }
997
998 // -----------------------------------------------------------------------------
999 bool
1000 CFormatGuess::TestFormatXml(
1001 EMode /* not used */ )
1002 {
1003 if ( ! EnsureTestBuffer() ) {
1004 return false;
1005 }
1006
1007 string input( m_pTestBuffer, (size_t)m_iTestDataSize );
1008 NStr::TruncateSpacesInPlace( input, NStr::eTrunc_Begin );
1009
1010 //
1011 // Test 1: If it starts with typical XML decorations such as "<?xml..."
1012 // then respect that:
1013 //
1014 if ( NStr::StartsWith( input, "<?XML", NStr::eNocase ) ) {
1015 return true;
1016 }
1017 if ( NStr::StartsWith( input, "<!DOCTYPE", NStr::eNocase ) ) {
1018 return true;
1019 }
1020
1021 //
1022 // Test 2: In the absence of XML specific declarations, check whether the
1023 // input starts with the opening tag of a well known set of doc types:
1024 //
1025 static const char* known_types[] = {
1026 "<Blast4-request>"
1027 };
1028 const int num_types = sizeof( known_types ) / sizeof( const char* );
1029
1030 for ( int i=0; i < num_types; ++i ) {
1031 if ( NStr::StartsWith( input, known_types[i], NStr::eCase ) ) {
1032 return true;
1033 }
1034 }
1035
1036 return false;
1037 }
1038
1039 // -----------------------------------------------------------------------------
1040 bool
1041 CFormatGuess::TestFormatAlignment(
1042 EMode /* not used */ )
1043 {
1044 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1045 return false;
1046 }
1047
1048 // Alignment files come in all different shapes and broken formats,
1049 // and some of them are hard to recognize as such, in particular
1050 // if they have been hacked up in a text editor.
1051
1052 // This functions only concerns itself with the ones that are
1053 // easy to recognize.
1054
1055 // Note: We can live with false negatives. Avoid false positives
1056 // at all cost.
1057
1058 ITERATE( list<string>, it, m_TestLines ) {
1059 if ( NPOS != it->find( "#NEXUS" ) ) {
1060 return true;
1061 }
1062 if ( NPOS != it->find( "CLUSTAL" ) ) {
1063 return true;
1064 }
1065 }
1066 return false;
1067 }
1068
1069 // -----------------------------------------------------------------------------
1070 bool
1071 CFormatGuess::x_TestTableDelimiter(const string& delims)
1072 {
1073 list<string>::const_iterator iter = m_TestLines.begin();
1074 list<string> toks;
1075
1076 // Merge delims if > 1. Do not merge single delims (since they could
1077 // more easily represent blank fields
1078 NStr::EMergeDelims merge_delims = NStr::eMergeDelims;
1079 if (delims.size() == 1)
1080 merge_delims = NStr::eNoMergeDelims;
1081
1082
1083 // Skip initial lines since not all headers start with comments like # or ;:
1084 // Don't skip though if file is very short - add up to 3, 1 for each line
1085 // over 5:
1086 for (size_t i=5; i<7; ++i)
1087 if (m_TestLines.size() > i) ++iter;
1088
1089 /// determine the number of observed columns
1090 size_t ncols = 0;
1091 bool found = false;
1092 for ( ; iter != m_TestLines.end() && ! found; ++iter) {
1093 if (iter->empty() || (*iter)[0] == '#' || (*iter)[0] == ';') {
1094 continue;
1095 }
1096
1097 toks.clear();
1098 NStr::Split(*iter, delims, toks);
1099 ncols = toks.size();
1100 found = true;
1101 }
1102 if ( ncols < 2 ) {
1103 return false;
1104 }
1105
1106 size_t nlines = 1;
1107 // verify that columns all have the same size
1108 // we can add an exception for the last line
1109 for ( ; iter != m_TestLines.end(); ++iter) {
1110 if (iter->empty() || (*iter)[0] == '#' || (*iter)[0] == ';') {
1111 continue;
1112 }
1113
1114 toks.clear();
1115 NStr::Split(*iter, delims, toks);
1116 if (toks.size() != ncols) {
1117 list<string>::const_iterator it = iter;
1118 ++it;
1119 if (it != m_TestLines.end() || (m_iTestDataSize < s_iTestBufferSize) ) {
1120 return false;
1121 }
1122 } else {
1123 ++nlines;
1124 }
1125 }
1126 return ( nlines >= 2 );
1127 }
1128
1129 bool
1130 CFormatGuess::TestFormatTable(
1131 EMode /* not used */ )
1132 {
1133 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1134 return false;
1135 }
1136
1137 //
1138 // NOTE 1:
1139 // There is a bunch of file formats that are a special type of table and
1140 // that we want to identify (like Repeat Masker output). So not to shade
1141 // out those more special formats, this test should be performed only after
1142 // all the more specialized table formats have been tested.
1143 //
1144
1145 //
1146 // NOTE 2:
1147 // The original criterion for this test was "the same number of observed
1148 // columns in every line".
1149 // In order to weed out false positives the following *additional*
1150 // conditions have been imposed:
1151 // - there are at least two observed columns
1152 // - the sample contains at least two non-comment lines.
1153 //
1154
1155 //' ' ' \t' '\t' ',' '|'
1156 if (x_TestTableDelimiter(" "))
1157 return true;
1158 else if (x_TestTableDelimiter(" \t"))
1159 return true;
1160 else if (x_TestTableDelimiter("\t"))
1161 return true;
1162 else if (x_TestTableDelimiter(","))
1163 return true;
1164 else if (x_TestTableDelimiter("|"))
1165 return true;
1166
1167 return false;
1168 }
1169
1170 // -----------------------------------------------------------------------------
1171 bool
1172 CFormatGuess::TestFormatFasta(
1173 EMode /* not used */ )
1174 {
1175 if ( ! EnsureStats() ) {
1176 return false;
1177 }
1178
1179 // reject obvious misfits:
1180 if ( m_iTestDataSize == 0 || m_pTestBuffer[0] != '>' ) {
1181 return false;
1182 }
1183 if ( m_iStatsCountData == 0 ) {
1184 if (0.75 > double(m_iStatsCountAlNumChars)/double(m_iTestDataSize) ) {
1185 return false;
1186 }
1187 return ( NStr::Find( m_pTestBuffer, "|" ) <= 10 );
1188 }
1189
1190 // remaining decision based on text stats:
1191 double dAlNumFraction = (double)m_iStatsCountAlNumChars / m_iTestDataSize;
1192 double dDnaFraction = (double)m_iStatsCountDnaChars / m_iStatsCountData;
1193 double dAaFraction = (double)m_iStatsCountAaChars / m_iStatsCountData;
1194
1195 // want at least 80% text-ish overall:
1196 if ( dAlNumFraction < 0.8 ) {
1197 return false;
1198 }
1199
1200 // want more than 91 percent of either DNA content or AA content in what we
1201 // presume is data:
1202 if ( dDnaFraction > 0.91 || dAaFraction > 0.91 ) {
1203 return true;
1204 }
1205 return false;
1206 }
1207
1208 // ----------------------------------------------------------------------------
1209 bool
1210 CFormatGuess::TestFormatTextAsn(
1211 EMode /* not used */ )
1212 {
1213 if ( ! EnsureStats() ) {
1214 return false;
1215 }
1216
1217 // reject obvious misfits:
1218 if ( m_iTestDataSize == 0 || m_pTestBuffer[0] == '>' ) {
1219 return false;
1220 }
1221
1222 // criteria:
1223 // at least 80% text-ish,
1224 // "::=" as the 2nd field of the first non-blank non comment line.
1225 //
1226 double dAlNumFraction = (double)m_iStatsCountAlNumChars / m_iTestDataSize;
1227 if ( dAlNumFraction < 0.80 ) {
1228 return false;
1229 }
1230
1231 CNcbiIstrstream TestBuffer(
1232 reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
1233 string strLine;
1234
1235 while ( ! TestBuffer.fail() ) {
1236 vector<string> Fields;
1237 NcbiGetline( TestBuffer, strLine, "\n\r" );
1238 NStr::Tokenize( strLine, " \t", Fields, NStr::eMergeDelims );
1239 if ( IsAsnComment( Fields ) ) {
1240 continue;
1241 }
1242 return ( Fields.size() >= 2 && Fields[1] == "::=" );
1243 }
1244 return false;
1245 }
1246
1247 // -----------------------------------------------------------------------------
1248 bool
1249 CFormatGuess::TestFormatTaxplot(
1250 EMode /* not used */ )
1251 {
1252 return false;
1253 }
1254
1255 // -----------------------------------------------------------------------------
1256 bool
1257 CFormatGuess::TestFormatSnpMarkers(
1258 EMode /* not used */ )
1259 {
1260 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1261 return false;
1262 }
1263 ITERATE( list<string>, it, m_TestLines ) {
1264 string str = *it;
1265 int rsid, chr, pos, numMatched;
1266 numMatched = sscanf( it->c_str(), "rs%d\t%d\t%d", &rsid, &chr, &pos);
1267 if ( numMatched == 3) {
1268 return true;
1269 }
1270 }
1271 return false;
1272 }
1273
1274
1275 // ----------------------------------------------------------------------------
1276 bool
1277 CFormatGuess::TestFormatBed(
1278 EMode /* not used */ )
1279 {
1280 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1281 return false;
1282 }
1283
1284 bool bTrackLineFound( false );
1285 bool bHasStartAndStop ( false );
1286 size_t columncount = 0;
1287 ITERATE( list<string>, it, m_TestLines ) {
1288 string str = NStr::TruncateSpaces( *it );
1289 if ( str.empty() ) {
1290 continue;
1291 }
1292
1293 // 'chr 8' fixup, the bedreader does this too
1294 if (str.find("chr ") == 0 ||
1295 str.find("Chr ") == 0 ||
1296 str.find("CHR ") == 0)
1297 str.erase(3, 1);
1298
1299 //
1300 // while occurrence of the following decorations _is_ a good sign, they could
1301 // also be indicator for a variety of other UCSC data formats
1302 //
1303 if ( NStr::StartsWith( str, "track" ) ) {
1304 bTrackLineFound = true;
1305 continue;
1306 }
1307 if ( NStr::StartsWith( str, "browser" ) ) {
1308 continue;
1309 }
1310 if ( NStr::StartsWith( str, "#" ) ) {
1311 continue;
1312 }
1313
1314 vector<string> columns;
1315 NStr::Tokenize( str, " \t", columns, NStr::eMergeDelims );
1316 if (columns.size() < 3 || columns.size() > 12) {
1317 return false;
1318 }
1319 if ( columns.size() != columncount ) {
1320 if ( columncount == 0 ) {
1321 columncount = columns.size();
1322 }
1323 else {
1324 return false;
1325 }
1326 }
1327 if(columns.size() >= 3) {
1328 if (s_IsTokenPosInt(columns[1]) &&
1329 s_IsTokenPosInt(columns[2])) {
1330 bHasStartAndStop = true;
1331 }
1332 }
1333 }
1334
1335 return (bHasStartAndStop || bTrackLineFound);
1336 }
1337
1338 // ----------------------------------------------------------------------------
1339 bool
1340 CFormatGuess::TestFormatBed15(
1341 EMode /* not used */ )
1342 {
1343 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1344 return false;
1345 }
1346
1347 bool LineFound = false;
1348 size_t columncount = 15;
1349 ITERATE( list<string>, it, m_TestLines ) {
1350 if ( NStr::TruncateSpaces( *it ).empty() ) {
1351 continue;
1352 }
1353 //
1354 // while occurrence of the following decorations _is_ a good sign, they could
1355 // also be indicator for a variety of other UCSC data formats
1356 //
1357 if ( NStr::StartsWith( *it, "track" ) ) {
1358 continue;
1359 }
1360 if ( NStr::StartsWith( *it, "browser" ) ) {
1361 continue;
1362 }
1363 if ( NStr::StartsWith( *it, "#" ) ) {
1364 continue;
1365 }
1366
1367 vector<string> columns;
1368 NStr::Tokenize( *it, " \t", columns, NStr::eMergeDelims );
1369 if ( columns.size() != columncount ) {
1370 return false;
1371 } else {
1372 if (!s_IsTokenPosInt(columns[1]) || //chr start
1373 !s_IsTokenPosInt(columns[2]) || //chr end
1374 !s_IsTokenPosInt(columns[4]) || //score
1375 !s_IsTokenPosInt(columns[6]) || //thick draw start
1376 !s_IsTokenPosInt(columns[7])) //thick draw end
1377 return false;
1378 string strand = NStr::TruncateSpaces(columns[5]);
1379
1380 if (strand != "+" && strand != "-")
1381 return false;
1382
1383 LineFound = true;
1384 }
1385 }
1386 return LineFound;
1387 }
1388
1389 // ----------------------------------------------------------------------------
1390 bool
1391 CFormatGuess::TestFormatWiggle(
1392 EMode /* not used */ )
1393 {
1394 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1395 return false;
1396 }
1397 ITERATE( list<string>, it, m_TestLines ) {
1398 if ( NStr::StartsWith( *it, "track" ) ) {
1399 if ( NStr::Find( *it, "type=wiggle_0" ) != NPOS ) {
1400 return true;
1401 }
1402 if ( NStr::Find( *it, "type=bedGraph" ) != NPOS ) {
1403 return true;
1404 }
1405 }
1406 if ( NStr::StartsWith(*it, "fixedStep") ) { /* MSS-140 */
1407 if ( NStr::Find(*it, "chrom=") && NStr::Find(*it, "start=") ) {
1408 return true;
1409 }
1410 }
1411 if ( NStr::StartsWith(*it, "variableStep") ) { /* MSS-140 */
1412 if ( NStr::Find(*it, "chrom=") ) {
1413 return true;
1414 }
1415 return true;
1416 }
1417 }
1418 return false;
1419 }
1420
1421 // ----------------------------------------------------------------------------
1422 bool
1423 CFormatGuess::TestFormatHgvs(
1424 EMode /* not used */ )
1425 {
1426 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1427 return false;
1428 }
1429
1430 unsigned int uHgvsLineCount = 0;
1431 list<string>::iterator it = m_TestLines.begin();
1432
1433 for ( ; it != m_TestLines.end(); ++it) {
1434 if ( it->empty() || (*it)[0] == '#' ) {
1435 continue;
1436 }
1437 if ( ! IsLineHgvs( *it ) ) {
1438 return false;
1439 }
1440 ++uHgvsLineCount;
1441 }
1442 return (uHgvsLineCount != 0);
1443 }
1444
1445
1446 // ----------------------------------------------------------------------------
1447 bool
1448 CFormatGuess::TestFormatZip(
1449 EMode /* not used */ )
1450 {
1451 if ( ! EnsureTestBuffer() ) {
1452 return false;
1453 }
1454
1455 // check if the first two bytes match with the zip magic number: 0x504B,
1456 // or BK and the next two bytes match with any of 0x0102, 0x0304, 0x0506
1457 // and 0x0708.
1458 if ( m_iTestDataSize < 4) {
1459 return false;
1460 }
1461
1462 if (m_pTestBuffer[0] == 'P' && m_pTestBuffer[1] == 'K' &&
1463 ((m_pTestBuffer[2] == (char)1 && m_pTestBuffer[3] == (char)2) ||
1464 (m_pTestBuffer[2] == (char)3 && m_pTestBuffer[3] == (char)4) ||
1465 (m_pTestBuffer[2] == (char)5 && m_pTestBuffer[3] == (char)6) ||
1466 (m_pTestBuffer[2] == (char)7 && m_pTestBuffer[3] == (char)8) ) ) {
1467 return true;
1468 }
1469
1470 return false;
1471 }
1472
1473
1474 // ----------------------------------------------------------------------------
1475 bool
1476 CFormatGuess::TestFormatGZip(
1477 EMode /* not used */ )
1478 {
1479 if ( ! EnsureTestBuffer() ) {
1480 return false;
1481 }
1482
1483 // check if the first two bytes match the gzip magic number: 0x1F8B
1484 if ( m_iTestDataSize < 2) {
1485 return false;
1486 }
1487
1488 if (m_pTestBuffer[0] == (char)31 && m_pTestBuffer[1] == (char)139) {
1489 return true;
1490 }
1491
1492 return false;
1493 }
1494
1495
1496 // ----------------------------------------------------------------------------
1497 bool
1498 CFormatGuess::TestFormatBZip2(
1499 EMode /* not used */ )
1500 {
1501 if ( ! EnsureTestBuffer() ) {
1502 return false;
1503 }
1504
1505 // check if the first two bytes match with the bzip2 magic number: 0x425A,
1506 // or 'BZ' and the next two bytes match with 0x68(h) and 0x31-39(1-9)
1507 if ( m_iTestDataSize < 4) {
1508 return false;
1509 }
1510
1511 if (m_pTestBuffer[0] == 'B' && m_pTestBuffer[1] == 'Z' &&
1512 m_pTestBuffer[2] == 'h' && m_pTestBuffer[3] >= '1' &&
1513 m_pTestBuffer[3] <= '9') {
1514 return true;
1515 }
1516
1517 return false;
1518 }
1519
1520
1521 // ----------------------------------------------------------------------------
1522 bool
1523 CFormatGuess::TestFormatLzo(
1524 EMode /* not used */ )
1525 {
1526 if ( ! EnsureTestBuffer() ) {
1527 return false;
1528 }
1529
1530 if (m_iTestDataSize >= 3 && m_pTestBuffer[0] == 'L' &&
1531 m_pTestBuffer[1] == 'Z' && m_pTestBuffer[2] == 'O') {
1532 if (m_iTestDataSize == 3 ||
1533 (m_iTestDataSize > 3 && m_pTestBuffer[3] == '\0')) {
1534 return true;
1535 }
1536 }
1537
1538 if (m_iTestDataSize >= 4 && m_pTestBuffer[1] == 'L' &&
1539 m_pTestBuffer[2] == 'Z' && m_pTestBuffer[3] == 'O') {
1540 if (m_iTestDataSize == 4 ||
1541 (m_iTestDataSize > 4 && m_pTestBuffer[4] == '\0')) {
1542 return true;
1543 }
1544 }
1545
1546 return false;
1547 }
1548
1549
1550 bool CFormatGuess::TestFormatSra(EMode /* not used */ )
1551 {
1552 if ( !EnsureTestBuffer() || m_iTestDataSize < 16
1553 || CTempString(m_pTestBuffer, 8) != "NCBI.sra") {
1554 return false;
1555 }
1556
1557 if (m_pTestBuffer[8] == '\x05' && m_pTestBuffer[9] == '\x03'
1558 && m_pTestBuffer[10] == '\x19' && m_pTestBuffer[11] == '\x88') {
1559 return true;
1560 } else if (m_pTestBuffer[8] == '\x88' && m_pTestBuffer[9] == '\x19'
1561 && m_pTestBuffer[10] == '\x03' && m_pTestBuffer[11] == '\x05') {
1562 return true;
1563 } else {
1564 return false;
1565 }
1566 }
1567
1568 bool CFormatGuess::TestFormatBam(EMode mode)
1569 {
1570 // Check for a gzip header whose first (only) extra field spans
1571 // at least six bytes and has the tag BC.
1572 return (TestFormatGZip(mode) && m_iTestDataSize >= 18
1573 && (m_pTestBuffer[3] & 4) != 0 // extra field present
1574 && (static_cast<unsigned char>(m_pTestBuffer[10]) >= 6
1575 || m_pTestBuffer[11] != 0) // at least six bytes
1576 && m_pTestBuffer[12] == 'B' && m_pTestBuffer[13] == 'C');
1577 }
1578
1579 // ----------------------------------------------------------------------------
1580 bool CFormatGuess::TestFormatVcf(
1581 EMode)
1582 // ----------------------------------------------------------------------------
1583 {
1584 // Currently, only look for the header line identifying the VCF version.
1585 // Waive requirement this be the first line, but still expect it to by
1586 // in the initial sample.
1587 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1588 return false;
1589 }
1590
1591 ITERATE( list<string>, it, m_TestLines ) {
1592 if (NStr::StartsWith(*it, "##fileformat=VCFv")) {
1593 return true;
1594 }
1595 }
1596 return false;
1597 }
1598
1599 // ----------------------------------------------------------------------------
1600 bool CFormatGuess::IsInputRepeatMaskerWithHeader()
1601 {
1602 //
1603 // Repeatmasker files consist of columnar data with a couple of lines
1604 // of column labels prepended to it (but sometimes someone strips those
1605 // labels).
1606 // This function tries to identify repeatmasker data by those column
1607 // label lines. They should be the first non-blanks in the file.
1608 //
1609 string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
1610 string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
1611
1612 //
1613 // Purge junk lines:
1614 //
1615 list<string>::iterator it = m_TestLines.begin();
1616 for ( ; it != m_TestLines.end(); ++it ) {
1617 NStr::TruncateSpacesInPlace( *it );
1618 if ( *it != "" ) {
1619 break;
1620 }
1621 }
1622
1623 if ( it == m_TestLines.end() ) {
1624 return false;
1625 }
1626
1627 //
1628 // Verify first line of labels:
1629 //
1630 size_t current_offset = 0;
1631 for ( size_t i=0; labels_1st_line[i] != ""; ++i ) {
1632 current_offset = NStr::FindCase( *it, labels_1st_line[i], current_offset );
1633 if ( current_offset == NPOS ) {
1634 return false;
1635 }
1636 }
1637
1638 //
1639 // Verify second line of labels:
1640 //
1641 ++it;
1642 if ( it == m_TestLines.end() ) {
1643 return false;
1644 }
1645 current_offset = 0;
1646 for ( size_t j=0; labels_2nd_line[j] != ""; ++j ) {
1647 current_offset = NStr::FindCase( *it, labels_2nd_line[j], current_offset );
1648 if ( current_offset == NPOS ) {
1649 return false;
1650 }
1651 }
1652
1653 //
1654 // Should have at least one extra line:
1655 //
1656 ++it;
1657 if ( it == m_TestLines.end() ) {
1658 return false;
1659 }
1660
1661 return true;
1662 }
1663
1664
1665 // ----------------------------------------------------------------------------
1666 bool CFormatGuess::IsInputRepeatMaskerWithoutHeader()
1667 {
1668 //
1669 // Repeatmasker files consist of columnar data with a couple of lines
1670 // of column labels prepended to it (but sometimes someone strips those
1671 // labels).
1672 // This function assumes the column labels have been stripped and attempts
1673 // to identify RMO by checking the data itself.
1674 //
1675
1676 //
1677 // We declare the data as RMO if we are able to parse every record in the
1678 // sample we got:
1679 //
1680 ITERATE( list<string>, it, m_TestLines ) {
1681 string str = NStr::TruncateSpaces( *it );
1682 if ( str == "" ) {
1683 continue;
1684 }
1685 if ( ! IsLineRmo( str ) ) {
1686 return false;
1687 }
1688 }
1689
1690 return true;
1691 }
1692
1693
1694 // ----------------------------------------------------------------------------
1695 bool
1696 CFormatGuess::IsSampleNewick(
1697 const string& cline )
1698 // ----------------------------------------------------------------------------
1699 {
1700 // NOTE:
1701 // See http://evolution.genetics.washington.edu/phylip/newick_doc.html
1702 //
1703 // Note that Newick tree tend to be written out as a single long line. Thus,
1704 // we are most likely only seeing the first part of a tree.
1705 //
1706
1707 // NOTE:
1708 // MSS-112 introduced the concept of multitree files is which after the ";"
1709 // another tree may start. The new logic accepts files as Newick if they
1710 // are Newick up to and including the first semicolon. It does not look
1711 // beyond.
1712
1713 string line = NStr::TruncateSpaces( cline );
1714 if ( line.empty() || line[0] != '(') {
1715 return false;
1716 }
1717 {{
1718 // Strip out comments:
1719 string trimmed;
1720 bool in_comment = false;
1721 for ( size_t ii=0; line.c_str()[ii] != 0; ++ii ) {
1722 if ( ! in_comment ) {
1723 if ( line.c_str()[ii] != '[' ) {
1724 trimmed += line.c_str()[ii];
1725 }
1726 else {
1727 in_comment = true;
1728 }
1729 }
1730 else /* in_comment */ {
1731 if ( line.c_str()[ii] == ']' ) {
1732 in_comment = false;
1733 }
1734 }
1735 }
1736 line = trimmed;
1737 }}
1738 {{
1739 // Compress quoted labels:
1740 string trimmed;
1741 bool in_quote = false;
1742 for ( size_t ii=0; line.c_str()[ii] != 0; ++ii ) {
1743 if ( ! in_quote ) {
1744 if ( line.c_str()[ii] != '\'' ) {
1745 trimmed += line.c_str()[ii];
1746 }
1747 else {
1748 in_quote = true;
1749 trimmed += 'A';
1750 }
1751 }
1752 else { /* in_quote */
1753 if ( line.c_str()[ii] == '\'' ) {
1754 in_quote = false;
1755 }
1756 }
1757 }
1758 line = trimmed;
1759 }}
1760 {{
1761 // Strip distance markers:
1762 string trimmed;
1763 size_t ii=0;
1764 while ( line.c_str()[ii] != 0 ) {
1765 if ( line.c_str()[ii] != ':' ) {
1766 trimmed += line.c_str()[ii++];
1767 }
1768 else {
1769 ii++;
1770 if ( line.c_str()[ii] == '-' || line.c_str()[ii] == '+' ) {
1771 ii++;
1772 }
1773 while ( '0' <= line.c_str()[ii] && line.c_str()[ii] <= '9' ) {
1774 ii++;
1775 }
1776 if ( line.c_str()[ii] == '.' ) {
1777 ii++;
1778 while ( '0' <= line.c_str()[ii] && line.c_str()[ii] <= '9' ) {
1779 ii++;
1780 }
1781 }
1782 }
1783 }
1784 line = trimmed;
1785 }}
1786 {{
1787 // Rough lexical analysis of what's left. Bail immediately on fault:
1788 if (line.empty() || line[0] != '(') {
1789 return false;
1790 }
1791 size_t paren_count = 1;
1792 for ( size_t ii=1; line.c_str()[ii] != 0; ++ii ) {
1793 switch ( line.c_str()[ii] ) {
1794 default:
1795 break;
1796 case '(':
1797 ++paren_count;
1798 break;
1799 case ')':
1800 if ( paren_count == 0 ) {
1801 return false;
1802 }
1803 --paren_count;
1804 break;
1805 case ',':
1806 if ( paren_count == 0 ) {
1807 return false;
1808 }
1809 break;
1810 case ';':
1811 // if ( line[ii+1] != 0 ) {
1812 // return false;
1813 // }
1814 break;
1815 }
1816 }
1817 }}
1818 return true;
1819 }
1820
1821
1822 // ----------------------------------------------------------------------------
1823 bool CFormatGuess::IsLineFlatFileSequence(
1824 const string& line )
1825 {
1826 // blocks of ten residues (or permitted punctuation characters)
1827 // with a count at the start or end; require at least four
1828 // (normally six)
1829 SIZE_TYPE pos = line.find_first_not_of("0123456789 \t");
1830 if (pos == NPOS || pos + 45 >= line.size()) {
1831 return false;
1832 }
1833
1834 for (SIZE_TYPE i = 0; i < 45; ++i) {
1835 char c = line[pos + i];
1836 if (i % 11 == 10) {
1837 if ( !isspace(c) ) {
1838 return false;
1839 }
1840 } else {
1841 if ( !isalpha(c) && c != '-' && c != '*') {
1842 return false;
1843 }
1844 }
1845 }
1846
1847 return true;
1848 }
1849
1850
1851 // ----------------------------------------------------------------------------
1852 bool CFormatGuess::IsLabelNewick(
1853 const string& label )
1854 {
1855 // Starts with a string of anything other than "[]:", optionally followed by
1856 // a single ':', followed by a number, optionally followed by a dot and
1857 // another number.
1858 if ( NPOS != label.find_first_of( "[]" ) ) {
1859 return false;
1860 }
1861 size_t colon = label.find( ':' );
1862 if ( NPOS == colon ) {
1863 return true;
1864 }
1865 size_t dot = label.find_first_not_of( "0123456789", colon + 1 );
1866 if ( NPOS == dot ) {
1867 return true;
1868 }
1869 if ( label[ dot ] != '.' ) {
1870 return false;
1871 }
1872 size_t end = label.find_first_not_of( "0123456789", dot + 1 );
1873 return ( NPOS == end );
1874 }
1875
1876
1877 // ----------------------------------------------------------------------------
1878 bool CFormatGuess::IsLineAgp(
1879 const string& strLine )
1880 {
1881 //
1882 // Note: The reader allows for line and endline comments starting with a '#'.
1883 // So we accept them here, too.
1884 //
1885 string line( strLine );
1886 size_t uCommentStart = NStr::Find( line, "#" );
1887
1888 if ( NPOS != uCommentStart ) {
1889 line = line.substr( 0, uCommentStart );
1890 }
1891 NStr::TruncateSpacesInPlace( line );
1892 if ( line.empty() ) {
1893 return true;
1894 }
1895
1896 vector<string> tokens;
1897 if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1898 return false;
1899 }
1900
1901 if ( tokens[1].size() > 1 && tokens[1][0] == '-' ) {
1902 tokens[1][0] = '1';
1903 }
1904 if ( -1 == NStr::StringToNonNegativeInt( tokens[1] ) ) {
1905 return false;
1906 }
1907
1908 if ( tokens[2].size() > 1 && tokens[2][0] == '-' ) {
1909 tokens[2][0] = '1';
1910 }
1911 if ( -1 == NStr::StringToNonNegativeInt( tokens[2] ) ) {
1912 return false;
1913 }
1914
1915 if ( tokens[3].size() > 1 && tokens[3][0] == '-' ) {
1916 tokens[3][0] = '1';
1917 }
1918 if ( -1 == NStr::StringToNonNegativeInt( tokens[3] ) ) {
1919 return false;
1920 }
1921
1922 if ( tokens[4].size() != 1 || NPOS == tokens[4].find_first_of( "ADFGPNOW" ) ) {
1923 return false;
1924 }
1925 if ( tokens[4] == "N" ) {
1926 if ( -1 == NStr::StringToNonNegativeInt( tokens[5] ) ) {
1927 return false;
1928 }
1929 }
1930 else {
1931 if ( -1 == NStr::StringToNonNegativeInt( tokens[6] ) ) {
1932 return false;
1933 }
1934 if ( -1 == NStr::StringToNonNegativeInt( tokens[7] ) ) {
1935 return false;
1936 }
1937 if ( tokens.size() != 9 ) {
1938 return false;
1939 }
1940 if ( tokens[8].size() != 1 || NPOS == tokens[8].find_first_of( "+-" ) ) {
1941 return false;
1942 }
1943 }
1944
1945 return true;
1946 }
1947
1948
1949 // ----------------------------------------------------------------------------
1950 bool CFormatGuess::IsLineGlimmer3(
1951 const string& line )
1952 {
1953 list<string> toks;
1954 NStr::Split(line, "\t ", toks);
1955 if (toks.size() != 5) {
1956 return false;
1957 }
1958
1959 list<string>::iterator i = toks.begin();
1960
1961 /// first column: skip (ascii identifier)
1962 ++i;
1963
1964 /// second, third columns: both ints
1965 if ( ! s_IsTokenInteger( *i++ ) ) {
1966 return false;
1967 }
1968 if ( ! s_IsTokenInteger( *i++ ) ) {
1969 return false;
1970 }
1971
1972 /// fourth column: int in the range of -3...3
1973 if ( ! s_IsTokenInteger( *i ) ) {
1974 return false;
1975 }
1976 int frame = NStr::StringToInt( *i++ );
1977 if (frame < -3 || frame > 3) {
1978 return false;
1979 }
1980
1981 /// fifth column: score; double
1982 if ( ! s_IsTokenDouble( *i ) ) {
1983 return false;
1984 }
1985
1986 return true;
1987 }
1988
1989
1990 // ----------------------------------------------------------------------------
1991 bool CFormatGuess::IsLineGtf(
1992 const string& line )
1993 {
1994 vector<string> tokens;
1995 if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1996 return false;
1997 }
1998 if ( ! s_IsTokenPosInt( tokens[3] ) ) {
1999 return false;
2000 }
2001 if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2002 return false;
2003 }
2004 if ( ! s_IsTokenDouble( tokens[5] ) ) {
2005 return false;
2006 }
2007 if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2008 return false;
2009 }
2010 if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2011 return false;
2012 }
2013 if ( tokens.size() < 9 ||
2014 (NPOS == tokens[8].find( "gene_id" ) && NPOS == tokens[8].find( "transcript_id" ) ) ) {
2015 return false;
2016 }
2017 return true;
2018 }
2019
2020
2021 // ----------------------------------------------------------------------------
2022 bool CFormatGuess::IsLineGvf(
2023 const string& line )
2024 {
2025 vector<string> tokens;
2026 if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
2027 return false;
2028 }
2029 if ( ! s_IsTokenPosInt( tokens[3] ) ) {
2030 return false;
2031 }
2032 if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2033 return false;
2034 }
2035 {{
2036 list<string> terms;
2037 terms.push_back("snv");
2038 terms.push_back("cnv");
2039 terms.push_back("copy_number_variation");
2040 terms.push_back("gain");
2041 terms.push_back("copy_number_gain");
2042 terms.push_back("loss");
2043 terms.push_back("copy_number_loss");
2044 terms.push_back("loss_of_heterozygosity");
2045 terms.push_back("complex");
2046 terms.push_back("complex_substitution");
2047 terms.push_back("complex_sequence_alteration");
2048 terms.push_back("indel");
2049 terms.push_back("insertion");
2050 terms.push_back("inversion");
2051 terms.push_back("substitution");
2052 terms.push_back("deletion");
2053 terms.push_back("duplication");
2054 terms.push_back("translocation");
2055 terms.push_back("upd");
2056 terms.push_back("uniparental_disomy");
2057 terms.push_back("maternal_uniparental_disomy");
2058 terms.push_back("paternal_uniparental_disomy");
2059 terms.push_back("tandom_duplication");
2060 terms.push_back("structural_variation");
2061 terms.push_back("sequence_alteration");
2062 ITERATE(list<string>, termiter, terms) {
2063 if(NStr::EqualNocase(*termiter, tokens[2]))
2064 return true;
2065 }
2066 }}
2067 if ( ! s_IsTokenDouble( tokens[5] ) ) {
2068 return false;
2069 }
2070 if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2071 return false;
2072 }
2073 if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2074 return false;
2075 }
2076 if(tokens.size() >= 9) {
2077 list<string> terms;
2078 terms.push_back("start_range");
2079 terms.push_back("end_range");
2080 terms.push_back("variant_seq");
2081 terms.push_back("genotype");
2082 ITERATE(list<string>, termiter, terms) {
2083 if(NStr::EqualNocase(*termiter, tokens[8]))
2084 return true;
2085 }
2086 }
2087
2088 return false;
2089 }
2090
2091
2092 // ----------------------------------------------------------------------------
2093 bool CFormatGuess::IsLineGff3(
2094 const string& line )
2095 {
2096 vector<string> tokens;
2097 if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
2098 return false;
2099 }
2100 if ( ! s_IsTokenPosInt( tokens[3] ) ) {
2101 return false;
2102 }
2103 if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2104 return false;
2105 }
2106 if ( ! s_IsTokenDouble( tokens[5] ) ) {
2107 return false;
2108 }
2109 if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2110 return false;
2111 }
2112 if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2113 return false;
2114 }
2115 if ( tokens.size() < 9 || tokens[8].empty()) {
2116 return false;
2117 }
2118 if ( tokens.size() >= 9 && tokens[8].size() > 1) {
2119 const string& col9 = tokens[8];
2120 if ( NPOS == NStr::FindNoCase(col9, "ID") &&
2121 NPOS == NStr::FindNoCase(col9, "Parent") &&
2122 NPOS == NStr::FindNoCase(col9, "Target") &&
2123 NPOS == NStr::FindNoCase(col9, "Name") &&
2124 NPOS == NStr::FindNoCase(col9, "Alias") &&
2125 NPOS == NStr::FindNoCase(col9, "Note") &&
2126 NPOS == NStr::FindNoCase(col9, "Dbxref") &&
2127 NPOS == NStr::FindNoCase(col9, "Xref") ) {
2128 return false;
2129 }
2130 }
2131
2132 return true;
2133 }
2134
2135
2136 // ----------------------------------------------------------------------------
2137 bool CFormatGuess::IsLineGff2(
2138 const string& line )
2139 {
2140 vector<string> tokens;
2141 if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
2142 return false;
2143 }
2144 if ( ! s_IsTokenPosInt( tokens[3] ) ) {
2145 return false;
2146 }
2147 if ( ! s_IsTokenPosInt( tokens[4] ) ) {
2148 return false;
2149 }
2150 if ( ! s_IsTokenDouble( tokens[5] ) ) {
2151 return false;
2152 }
2153 if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
2154 return false;
2155 }
2156 if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
2157 return false;
2158 }
2159 return true;
2160 }
2161
2162
2163 // ----------------------------------------------------------------------------
2164 bool CFormatGuess::IsLinePhrapId(
2165 const string& line )
2166 {
2167 vector<string> values;
2168 if ( NStr::Tokenize( line, " \t", values, NStr::eMergeDelims ).empty() ) {
2169 return false;
2170 }
2171
2172 //
2173 // Old style: "^DNA \\w+ "
2174 //
2175 if ( values[0] == "DNA" ) {
2176 return true;
2177 }
2178
2179 //
2180 // New style: "^AS [0-9]+ [0-9]+"
2181 //
2182 if ( values[0] == "AS" ) {
2183 return ( 0 <= NStr::StringToNonNegativeInt( values[1] ) &&
2184 0 <= NStr::StringToNonNegativeInt( values[2] ) );
2185 }
2186
2187 return false;
2188 }
2189
2190
2191 // ----------------------------------------------------------------------------
2192 bool CFormatGuess::IsLineRmo(
2193 const string& line )
2194 {
2195 const size_t MIN_VALUES_PER_RECORD = 14;
2196
2197 //
2198 // Make sure there is enough stuff on that line:
2199 //
2200 list<string> values;
2201 if ( NStr::Split( line, " \t", values ).size() < MIN_VALUES_PER_RECORD ) {
2202 return false;
2203 }
2204
2205 //
2206 // Look at specific values and make sure they are of the correct type:
2207 //
2208
2209 // 1: positive integer:
2210 list<string>::iterator it = values.begin();
2211 if ( ! s_IsTokenPosInt( *it ) ) {
2212 return false;
2213 }
2214
2215 // 2: float:
2216 ++it;
2217 if ( ! s_IsTokenDouble( *it ) ) {
2218 return false;
2219 }
2220
2221 // 3: float:
2222 ++it;
2223 if ( ! s_IsTokenDouble( *it ) ) {
2224 return false;
2225 }
2226
2227 // 4: float:
2228 ++it;
2229 if ( ! s_IsTokenDouble( *it ) ) {
2230 return false;
2231 }
2232
2233 // 5: string, not checked
2234 ++it;
2235
2236 // 6: positive integer:
2237 ++it;
2238 if ( ! s_IsTokenPosInt( *it ) ) {
2239 return false;
2240 }
2241
2242 // 7: positive integer:
2243 ++it;
2244 if ( ! s_IsTokenPosInt( *it ) ) {
2245 return false;
2246 }
2247
2248 // 8: positive integer, likely in paretheses, not checked:
2249 ++it;
2250
2251 // 9: '+' or 'C':
2252 ++it;
2253 if ( *it != "+" && *it != "C" ) {
2254 return false;
2255 }
2256
2257 // and that's enough for now. But there are at least two more fields
2258 // with values that look testable.
2259
2260 return true;
2261 }
2262
2263
2264 // ----------------------------------------------------------------------------
2265 bool
2266 CFormatGuess::IsAsnComment(
2267 const vector<string>& Fields )
2268 {
2269 if ( Fields.size() == 0 ) {
2270 return true;
2271 }
2272 return ( NStr::StartsWith( Fields[0], "--" ) );
2273 }
2274
2275 // ----------------------------------------------------------------------------
2276 bool
2277 CFormatGuess::EnsureSplitLines()
2278 // ----------------------------------------------------------------------------
2279 {
2280 if ( m_bSplitDone ) {
2281 return !m_TestLines.empty();
2282 }
2283 m_bSplitDone = true;
2284
2285 //
2286 // Make sure the given data is ASCII before checking potential line breaks:
2287 //
2288 const size_t MIN_HIGH_RATIO = 20;
2289 size_t high_count = 0;
2290 for ( streamsize i=0; i < m_iTestDataSize; ++i ) {
2291 if ( 0x80 & m_pTestBuffer[i] ) {
2292 ++high_count;
2293 }
2294 }
2295 if ( 0 < high_count && m_iTestDataSize / high_count < MIN_HIGH_RATIO ) {
2296 return false;
2297 }
2298
2299 //
2300 // Let's expect at least one line break in the given data:
2301 //
2302 string data( m_pTestBuffer, (size_t)m_iTestDataSize );
2303 m_TestLines.clear();
2304
2305 if ( string::npos != data.find( "\r\n" ) ) {
2306 NStr::Split( data, "\r\n", m_TestLines );
2307 }
2308 else if ( string::npos != data.find( "\n" ) ) {
2309 NStr::Split( data, "\n", m_TestLines );
2310 }
2311 else if ( string::npos != data.find( "\r" ) ) {
2312 NStr::Split( data, "\r", m_TestLines );
2313 }
2314 else {
2315 //single truncated line
2316 return false;
2317 }
2318
2319 if ( m_iTestDataSize == s_iTestBufferSize && m_TestLines.size() > 1 ) {
2320 m_TestLines.pop_back();
2321 }
2322 return !m_TestLines.empty();
2323 }
2324
2325 // ----------------------------------------------------------------------------
2326 bool
2327 CFormatGuess::IsAllComment()
2328 {
2329 // first stab - are we text? comments are only valid if we are text
2330 size_t count = 0;
2331 size_t count_print = 0;
2332 for (int i = 0; i < m_iTestDataSize; ++i, ++count) {
2333 if (isprint((unsigned char) m_pTestBuffer[i])) {
2334 ++count_print;
2335 }
2336 }
2337 if (count_print < count * 0.9) {
2338 // 10% non-printing at least; likely not text
2339 return false;
2340 }
2341
2342 m_bSplitDone = false;
2343 m_TestLines.clear();
2344 EnsureSplitLines();
2345
2346 ITERATE(list<string>, it, m_TestLines) {
2347 if(it->empty()) {
2348 continue;
2349 }
2350 else if(NStr::StartsWith(*it, "#")) {
2351 continue;
2352 }
2353 else if(NStr::StartsWith(*it, "--")) {
2354 continue;
2355 }
2356 else {
2357 return false;
2358 }
2359 }
2360
2361 return true;
2362 }
2363
2364 // ----------------------------------------------------------------------------
2365 bool CFormatGuess::IsLineHgvs(
2366 const string& line )
2367 {
2368 // This simple check can mistake Newwick, so Newwick is checked first
2369 // /:(g|c|r|p|m|mt|n)\./ as in NC_000001.9:g.1234567C>T
2370 int State = 0;
2371 ITERATE(string, Iter, line) {
2372 char Char = *Iter;
2373 char Next = '\0';
2374 string::const_iterator NextI = Iter;
2375 ++NextI;
2376 if(NextI != line.end())
2377 Next = *NextI;
2378
2379 if(State == 0) {
2380 if(Char == ':')
2381 State = 1;
2382 } else if(State == 1) {
2383 if (Char == 'g' ||
2384 Char == 'c' ||
2385 Char == 'r' ||
2386 Char == 'p' ||
2387 Char == 'n' ||
2388 Char == 'm' ) {
2389 State = 2;
2390 if (Char=='m' && Next == 't') {
2391 ++Iter;
2392 }
2393 }
2394 } else if(State == 2) {
2395 if(Char == '.')
2396 State = 3;
2397 }
2398 }
2399
2400 return (State == 3);
2401 }
2402
2403
2404
2405 END_NCBI_SCOPE
2406 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |