|
NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/src/util/format_guess.cpp |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: format_guess.cpp 173835 2009-10-21 15:07:35Z ludwigf $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Anatoliy Kuznetsov
27 *
28 * File Description: Implemented methods to identify file formats.
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <util/format_guess.hpp>
34 #include <util/util_exception.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/stream_utils.hpp>
38
39 BEGIN_NCBI_SCOPE
40
/// Bit flags describing the classes a single byte value belongs to.
/// Several flags may be OR-ed together for one character.
enum ESymbolType {
    fDNA_Alphabet     = 0x01,  ///< letter of the DNA alphabet
    fProtein_Alphabet = 0x02,  ///< letter of the protein alphabet
    fLineEnd          = 0x04,  ///< line terminator
    fAlpha            = 0x08,  ///< alphabetic character
    fDigit            = 0x10,  ///< decimal digit
    fSpace            = 0x20,  ///< whitespace
    fInvalid          = 0x40   ///< not a usable text character
};
50
/// Three-level confidence value.
enum EConfidence {
    eNo    = 0,
    eMaybe = 1,
    eYes   = 2
};
56
57
58 // ============================================================================
59 // Helper routine--- file scope only:
60 // ============================================================================
61
// Classification table indexed by byte value; each entry holds OR-ed
// ESymbolType flags. Being a file-scope static it starts out all zero.
static unsigned char symbol_type_table[256];
63
64 // ----------------------------------------------------------------------------
65 static bool s_IsTokenPosInt(
66 const string& strToken )
67 {
68 return ( -1 != NStr::StringToNumeric( strToken ) );
69 }
70
71 // ----------------------------------------------------------------------------
72 static bool s_IsTokenInteger(
73 const string& strToken )
74 // ----------------------------------------------------------------------------
75 {
76 if ( ! strToken.empty() && strToken[0] == '-' ) {
77 return s_IsTokenPosInt( strToken.substr( 1 ) );
78 }
79 return s_IsTokenPosInt( strToken );
80 }
81
82 // ----------------------------------------------------------------------------
83 static bool s_IsTokenDouble(
84 const string& strToken )
85 {
86 string token( strToken );
87 NStr::ReplaceInPlace( token, ".", "1", 0, 1 );
88 if ( token.size() > 1 && token[0] == '-' ) {
89 token[0] = '1';
90 }
91 return s_IsTokenPosInt( token );
92 }
93
94 // ----------------------------------------------------------------------------
95 static void init_symbol_type_table(void)
96 {
97 if ( symbol_type_table[0] == 0 ) {
98 for ( const char* s = "ATGCN"; *s; ++s ) {
99 unsigned char c = *s;
100 symbol_type_table[c] |= fDNA_Alphabet;
101 c = tolower(c);
102 symbol_type_table[c] |= fDNA_Alphabet;
103 }
104 for ( const char* s = "ACDEFGHIKLMNPQRSTVWYBZX"; *s; ++s ) {
105 unsigned char c = *s;
106 symbol_type_table[c] |= fProtein_Alphabet;
107 c = tolower(c);
108 symbol_type_table[c] |= fProtein_Alphabet;
109 }
110 for ( const char* s = "\r\n"; *s; ++s ) {
111 unsigned char c = *s;
112 symbol_type_table[c] |= fLineEnd;
113 }
114 for ( int c = 1; c < 256; ++c ) {
115 if ( isalpha(c) )
116 symbol_type_table[c] |= fAlpha;
117 if ( isdigit(c) )
118 symbol_type_table[c] |= fDigit;
119 if ( isspace(c) )
120 symbol_type_table[c] |= fSpace;
121 }
122 symbol_type_table[0] |= fInvalid;
123 }
124 }
125
126
127 // ============================================================================
128 // Old style class interface:
129 // ============================================================================
130
131 // ----------------------------------------------------------------------------
132 CFormatGuess::ESequenceType
133 CFormatGuess::SequenceType(const char* str, unsigned length)
134 {
135 if (length == 0)
136 length = (unsigned)::strlen(str);
137
138 init_symbol_type_table();
139 unsigned ATGC_content = 0;
140 unsigned amino_acid_content = 0;
141
142 for (unsigned i = 0; i < length; ++i) {
143 unsigned char c = str[i];
144 unsigned char type = symbol_type_table[c];
145 if ( type & fDNA_Alphabet ) {
146 ++ATGC_content;
147 }
148 if ( type & fProtein_Alphabet ) {
149 ++amino_acid_content;
150 }
151 }
152
153 double dna_content = (double)ATGC_content / (double)length;
154 double prot_content = (double)amino_acid_content / (double)length;
155
156 if (dna_content > 0.7) {
157 return eNucleotide;
158 }
159 if (prot_content > 0.7) {
160 return eProtein;
161 }
162 return eUndefined;
163 }
164
165
166 // ----------------------------------------------------------------------------
167 CFormatGuess::EFormat CFormatGuess::Format(const string& path, EOnError onerror)
168 {
169 CNcbiIfstream input(path.c_str(), IOS_BASE::in | IOS_BASE::binary);
170 return Format(input);
171 }
172
173 // ----------------------------------------------------------------------------
174 CFormatGuess::EFormat CFormatGuess::Format(CNcbiIstream& input, EOnError onerror)
175 {
176 CFormatGuess FG( input );
177 return FG.GuessFormat( onerror );
178 }
179
180
181 // ============================================================================
182 // New style object interface:
183 // ============================================================================
184
185 // ----------------------------------------------------------------------------
186 CFormatGuess::CFormatGuess()
187 : m_Stream( * new CNcbiIfstream )
188 , m_bOwnsStream( true )
189 {
190 Initialize();
191 }
192
193 // ----------------------------------------------------------------------------
194 CFormatGuess::CFormatGuess(
195 const string& FileName )
196 : m_Stream( * new CNcbiIfstream( FileName.c_str() ) )
197 , m_bOwnsStream( true )
198 {
199 Initialize();
200 }
201
202 // ----------------------------------------------------------------------------
203 CFormatGuess::CFormatGuess(
204 CNcbiIstream& Stream )
205 : m_Stream( Stream )
206 , m_bOwnsStream( false )
207 {
208 Initialize();
209 }
210
211 // ----------------------------------------------------------------------------
212 CFormatGuess::~CFormatGuess()
213 {
214 delete[] m_pTestBuffer;
215 if ( m_bOwnsStream ) {
216 delete &m_Stream;
217 }
218 }
219
220 // ----------------------------------------------------------------------------
221 CFormatGuess::EFormat
222 CFormatGuess::GuessFormat( EMode )
223 {
224 return GuessFormat(eDefault);
225 }
226
227 // ----------------------------------------------------------------------------
228 CFormatGuess::EFormat
229 CFormatGuess::GuessFormat(
230 EOnError onerror )
231 {
232 if (!x_TestInput(m_Stream, onerror)) {
233 return eUnknown;
234 }
235 EMode mode = eQuick;
236
237 // First, try to use hints
238 if ( !m_Hints.IsEmpty() ) {
239 for (int f = 1 /* skip eUnknown */; f < eFormat_max; ++f) {
240 EFormat fmt = EFormat(f);
241 if (m_Hints.IsPreferred(fmt) && x_TestFormat(fmt, mode)) {
242 return fmt;
243 }
244 }
245 }
246
247 // Check other formats
248 for (int f = 1 /* skip eUnknown */; f < eFormat_max; ++f) {
249 if ( x_TestFormat(EFormat(f), mode) ) {
250 return EFormat(f);
251 }
252 }
253 return eUnknown;
254 }
255
256 // ----------------------------------------------------------------------------
257 bool
258 CFormatGuess::TestFormat( EFormat format, EMode )
259 {
260 return TestFormat( format, eDefault);
261 }
262
263 // ----------------------------------------------------------------------------
264 bool
265 CFormatGuess::TestFormat(
266 EFormat format,
267 EOnError onerror )
268 {
269 if (format != eUnknown && !x_TestInput(m_Stream, onerror)) {
270 return false;
271 }
272 EMode mode = eQuick;
273 return x_TestFormat(format, mode);
274 }
275
276 // ----------------------------------------------------------------------------
277 bool CFormatGuess::x_TestFormat(EFormat format, EMode mode)
278 {
279 // First check if the format is disabled
280 if ( m_Hints.IsDisabled(format) ) {
281 return false;
282 }
283
284 switch( format ) {
285
286 case eBinaryASN:
287 return TestFormatBinaryAsn( mode );
288 case eRmo:
289 return TestFormatRepeatMasker( mode );
290 case eGtf:
291 return TestFormatGtf( mode );
292 case eGlimmer3:
293 return TestFormatGlimmer3( mode );
294 case eAgp:
295 return TestFormatAgp( mode );
296 case eXml:
297 return TestFormatXml( mode );
298 case eWiggle:
299 return TestFormatWiggle( mode );
300 case eBed:
301 return TestFormatBed( mode );
302 case eBed15:
303 return TestFormatBed15( mode );
304 case eNewick:
305 return TestFormatNewick( mode );
306 case eAlignment:
307 return TestFormatAlignment( mode );
308 case eDistanceMatrix:
309 return TestFormatDistanceMatrix( mode );
310 case eFlatFileSequence:
311 return TestFormatFlatFileSequence( mode );
312 case eFiveColFeatureTable:
313 return TestFormatFiveColFeatureTable( mode );
314 case eSnpMarkers:
315 return TestFormatSnpMarkers( mode );
316 case eFasta:
317 return TestFormatFasta( mode );
318 case eTextASN:
319 return TestFormatTextAsn( mode );
320 case eTaxplot:
321 return TestFormatTaxplot( mode );
322 case ePhrapAce:
323 return TestFormatPhrapAce( mode );
324 case eTable:
325 return TestFormatTable( mode );
326
327 default:
328 NCBI_THROW( CCoreException, eInvalidArg,
329 "CFormatGuess::x_TestFormat(): Unsupported format ID." );
330 }
331 }
332
333 // ----------------------------------------------------------------------------
334 void
335 CFormatGuess::Initialize()
336 {
337 m_pTestBuffer = 0;
338
339 m_bStatsAreValid = false;
340 m_bSplitDone = false;
341 m_iStatsCountData = 0;
342 m_iStatsCountAlNumChars = 0;
343 m_iStatsCountDnaChars = 0;
344 m_iStatsCountAaChars = 0;
345 }
346
347 // ----------------------------------------------------------------------------
348 bool
349 CFormatGuess::EnsureTestBuffer()
350 {
351 if ( m_pTestBuffer ) {
352 return true;
353 }
354 if ( ! m_Stream.good() ) {
355 return false;
356 }
357 m_pTestBuffer = new char[ s_iTestBufferSize ];
358 m_Stream.read( m_pTestBuffer, s_iTestBufferSize );
359 m_iTestDataSize = m_Stream.gcount();
360 m_Stream.clear(); // in case we reached eof
361 CStreamUtils::Stepback( m_Stream, m_pTestBuffer, m_iTestDataSize );
362 return true;
363 }
364
365 // ----------------------------------------------------------------------------
366 bool
367 CFormatGuess::EnsureStats()
368 {
369 if ( m_bStatsAreValid ) {
370 return true;
371 }
372 if ( ! EnsureTestBuffer() ) {
373 return false;
374 }
375 if ( m_iTestDataSize == 0 ) {
376 m_bStatsAreValid = true;
377 return true;
378 }
379
380 CNcbiIstrstream TestBuffer(
381 reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
382 string strLine;
383
384 init_symbol_type_table();
385 // Things we keep track of:
386 // m_iStatsCountAlNumChars: number of characters that are letters or
387 // digits
388 // m_iStatsCountData: number of characters not part of a line starting
389 // with '>'
390 // m_iStatsCountDnaChars: number of characters counted in m_iStatsCountData
391 // from the DNA alphabet
392 // m_iStatsCountAaChars: number of characters counted in m_iStatsCountData
393 // from the AA alphabet
394 //
395 while ( ! TestBuffer.fail() ) {
396 NcbiGetlineEOL( TestBuffer, strLine );
397 // code in CFormatGuess::Format counts line ends
398 // so, we will count them here as well
399 if (!strLine.empty()) {
400 strLine += '\n';
401 }
402 size_t size = strLine.size();
403 bool is_header = size > 0 && strLine[0] == '>';
404 for ( size_t i=0; i < size; ++i ) {
405 unsigned char c = strLine[i];
406 unsigned char type = symbol_type_table[c];
407
408 if ( type & (fAlpha | fDigit | fSpace) ) {
409 ++m_iStatsCountAlNumChars;
410 }
411 if ( !is_header ) {
412 ++m_iStatsCountData;
413
414 if ( type & fDNA_Alphabet ) {
415 ++m_iStatsCountDnaChars;
416 }
417 if ( type & fProtein_Alphabet ) {
418 ++m_iStatsCountAaChars;
419 }
420 if ( type & fLineEnd ) {
421 ++m_iStatsCountAlNumChars;
422 --m_iStatsCountData;
423 }
424 }
425 }
426 }
427 m_bStatsAreValid = true;
428 return true;
429 }
430
431 // ----------------------------------------------------------------------------
432 bool CFormatGuess::x_TestInput( CNcbiIstream& input, EOnError onerror )
433 {
434 if (!input) {
435 if (onerror == eThrowOnBadSource) {
436 NCBI_THROW(CUtilException,eNoInput,"Unreadable input stream");
437 }
438 return false;
439 }
440 return true;
441 }
442
443 // ----------------------------------------------------------------------------
444 bool
445 CFormatGuess::TestFormatRepeatMasker(
446 EMode /* not used */ )
447 {
448 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
449 return false;
450 }
451 return IsInputRepeatMaskerWithHeader() ||
452 IsInputRepeatMaskerWithoutHeader();
453 }
454
455 // ----------------------------------------------------------------------------
456 bool
457 CFormatGuess::TestFormatPhrapAce(
458 EMode /* not used */ )
459 {
460 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
461 return false;
462 }
463
464 ITERATE( list<string>, it, m_TestLines ) {
465 if ( IsLinePhrapId( *it ) ) {
466 return true;
467 }
468 }
469 return false;
470 }
471
472 // -----------------------------------------------------------------------------
473 bool
474 CFormatGuess::TestFormatGtf(
475 EMode /* not used */ )
476 {
477 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
478 return false;
479 }
480
481 unsigned int uGtfLineCount = 0;
482 list<string>::iterator it = m_TestLines.begin();
483 for ( ; it != m_TestLines.end(); ++it) {
484 if ( !it->empty() && (*it)[0] != '#') {
485 break;
486 }
487 }
488
489 for ( ; it != m_TestLines.end(); ++it) {
490 //
491 // Make sure to ignore any UCSC track and browser lines prior to the
492 // start of data
493 //
494 if ( !uGtfLineCount && NStr::StartsWith( *it, "browser " ) ) {
495 continue;
496 }
497 if ( !uGtfLineCount && NStr::StartsWith( *it, "track " ) ) {
498 continue;
499 }
500 if ( ! IsLineGtf( *it ) ) {
501 return false;
502 }
503 ++uGtfLineCount;
504 }
505 return (uGtfLineCount != 0);
506 }
507
508 // -----------------------------------------------------------------------------
509 bool
510 CFormatGuess::TestFormatGlimmer3(
511 EMode /* not used */ )
512 {
513 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
514 return false;
515 }
516
517 /// first line should be a FASTA defline
518 list<string>::iterator it = m_TestLines.begin();
519 if (it->empty() || (*it)[0] != '>') {
520 return false;
521 }
522
523 /// next lines should be easily parseable, with five columns
524 for (++it; it != m_TestLines.end(); ++it) {
525 if ( IsLineGlimmer3( *it ) ) {
526 return true;
527 }
528 }
529
530 return false;
531 }
532
533 // -----------------------------------------------------------------------------
534 bool
535 CFormatGuess::TestFormatAgp(
536 EMode /* not used */ )
537 {
538 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
539 return false;
540 }
541 ITERATE( list<string>, it, m_TestLines ) {
542 if ( ! IsLineAgp( *it ) ) {
543 return false;
544 }
545 }
546 return true;
547 }
548
549 // -----------------------------------------------------------------------------
550 bool
551 CFormatGuess::TestFormatNewick(
552 EMode /* not used */ )
553 {
554 if ( ! EnsureTestBuffer() ) {
555 return false;
556 }
557 // Maybe we get home early ...
558 if ( m_iTestDataSize > 0 && m_pTestBuffer[0] != '(' ) {
559 return false;
560 }
561 if ( ! EnsureSplitLines() ) {
562 return false;
563 }
564
565 string one_line;
566 ITERATE( list<string>, it, m_TestLines ) {
567 one_line += *it;
568 }
569
570 if ( ! IsLineNewick( one_line ) ) {
571 return false;
572 }
573 return true;
574 }
575
576 // -----------------------------------------------------------------------------
577 bool
578 CFormatGuess::TestFormatBinaryAsn(
579 EMode /* not used */ )
580 {
581 if ( ! EnsureTestBuffer() ) {
582 return false;
583 }
584
585 //
586 // Criterion: Presence of any non-printing characters
587 //
588 EConfidence conf = eNo;
589 for (int i = 0; i < m_iTestDataSize; ++i) {
590 if ( !isgraph((unsigned char) m_pTestBuffer[i]) &&
591 !isspace((unsigned char) m_pTestBuffer[i]) )
592 {
593 if (m_pTestBuffer[i] == '\1') {
594 conf = eMaybe;
595 } else {
596 return true;
597 }
598 }
599 }
600 return (conf == eYes);
601 }
602
603
604 // -----------------------------------------------------------------------------
605 bool
606 CFormatGuess::TestFormatDistanceMatrix(
607 EMode /* not used */ )
608 {
609 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
610 return false;
611 }
612
613 //
614 // criteria are odd:
615 //
616 list<string>::const_iterator iter = m_TestLines.begin();
617 list<string> toks;
618
619 /// first line: one token, one number
620 NStr::Split(*iter++, "\t ", toks);
621 if (toks.size() != 1 ||
622 toks.front().find_first_not_of("0123456789") != string::npos) {
623 return false;
624 }
625
626 // now, for remaining ones, we expect an alphanumeric item first,
627 // followed by a set of floating-point values. Unless we are at the last
628 // line, the number of values should increase monotonically
629 for (size_t i = 1; iter != m_TestLines.end(); ++i, ++iter) {
630 toks.clear();
631 NStr::Split(*iter, "\t ", toks);
632 if (toks.size() != i) {
633 /// we can ignore the last line ; it may be truncated
634 list<string>::const_iterator it = iter;
635 ++it;
636 if (it != m_TestLines.end()) {
637 return false;
638 }
639 }
640
641 list<string>::const_iterator it = toks.begin();
642 for (++it; it != toks.end(); ++it) {
643 if ( ! s_IsTokenDouble( *it ) ) {
644 return false;
645 }
646 }
647 }
648
649 return true;
650 }
651
652 // -----------------------------------------------------------------------------
653 bool
654 CFormatGuess::TestFormatFlatFileSequence(
655 EMode /* not used */ )
656 {
657 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
658 return false;
659 }
660
661 ITERATE (list<string>, it, m_TestLines) {
662 if ( ! IsLineFlatFileSequence( *it ) ) {
663 return false;
664 }
665 }
666 return true;
667 }
668
669 // -----------------------------------------------------------------------------
670 bool
671 CFormatGuess::TestFormatFiveColFeatureTable(
672 EMode /* not used */ )
673 {
674 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
675 return false;
676 }
677
678 ITERATE( list<string>, it, m_TestLines ) {
679 if (it->empty()) {
680 continue;
681 }
682
683 if (it->find(">Feature ") != 0) {
684 return false;
685 }
686 if (it->find_first_of(" \t", 9) != string::npos) {
687 return false;
688 }
689 break;
690 }
691
692 return true;
693 }
694
695 // -----------------------------------------------------------------------------
696 bool
697 CFormatGuess::TestFormatXml(
698 EMode /* not used */ )
699 {
700 if ( ! EnsureTestBuffer() ) {
701 return false;
702 }
703
704 string input( m_pTestBuffer, m_iTestDataSize );
705 NStr::TruncateSpacesInPlace( input, NStr::eTrunc_Begin );
706
707 //
708 // Test 1: If it starts with typical XML decorations such as "<?xml..."
709 // then respect that:
710 //
711 if ( NStr::StartsWith( input, "<?XML", NStr::eNocase ) ) {
712 return true;
713 }
714 if ( NStr::StartsWith( input, "<!DOCTYPE", NStr::eNocase ) ) {
715 return true;
716 }
717
718 //
719 // Test 2: In the absence of XML specific declarations, check whether the
720 // input starts with the opening tag of a well known set of doc types:
721 //
722 static const char* known_types[] = {
723 "<Blast4-request>"
724 };
725 const int num_types = sizeof( known_types ) / sizeof( const char* );
726
727 for ( int i=0; i < num_types; ++i ) {
728 if ( NStr::StartsWith( input, known_types[i], NStr::eCase ) ) {
729 return true;
730 }
731 }
732
733 return false;
734 }
735
736 // -----------------------------------------------------------------------------
737 bool
738 CFormatGuess::TestFormatAlignment(
739 EMode /* not used */ )
740 {
741 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
742 return false;
743 }
744
745 // Alignment files come in all different shapes and broken formats,
746 // and some of them are hard to recognize as such, in particular
747 // if they have been hacked up in a text editor.
748
749 // This functions only concerns itself with the ones that are
750 // easy to recognize.
751
752 // Note: We can live with false negatives. Avoid false positives
753 // at all cost.
754
755 ITERATE( list<string>, it, m_TestLines ) {
756 if ( NPOS != it->find( "#NEXUS" ) ) {
757 return true;
758 }
759 if ( NPOS != it->find( "CLUSTAL" ) ) {
760 return true;
761 }
762 }
763 return false;
764 }
765
766 // -----------------------------------------------------------------------------
767 bool
768 CFormatGuess::TestFormatTable(
769 EMode /* not used */ )
770 {
771 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
772 return false;
773 }
774
775 //
776 // NOTE 1:
777 // There is a bunch of file formats that are a special type of table and
778 // that we want to identify (like Repeat Masker output). So not to shade
779 // out those more special formats, this test should be performed only after
780 // all the more specialized table formats have been tested.
781 //
782
783 //
784 // NOTE 2:
785 // The original criterion for this test was "the same number of observed
786 // columns in every line".
787 // In order to weed out false positives the following *additional*
788 // conditions have been imposed:
789 // - there are at least two observed columns
790 // - the sample contains at least two non-comment lines.
791 //
792
793 list<string>::const_iterator iter = m_TestLines.begin();
794 list<string> toks;
795
796 /// determine the number of observed columns
797 size_t ncols = 0;
798 bool found = false;
799 for ( ; iter != m_TestLines.end() && ! found; ++iter) {
800 if (iter->empty() || (*iter)[0] == '#' || (*iter)[0] == ';') {
801 continue;
802 }
803
804 toks.clear();
805 NStr::Split(*iter, " \t,", toks);
806 ncols = toks.size();
807 found = true;
808 }
809 if ( ncols < 2 ) {
810 return false;
811 }
812
813 size_t nlines = 1;
814 // verify that columns all have the same size
815 // we can add an exception for the last line
816 for ( ; iter != m_TestLines.end(); ++iter) {
817 if (iter->empty() || (*iter)[0] == '#' || (*iter)[0] == ';') {
818 continue;
819 }
820
821 toks.clear();
822 NStr::Split(*iter, " \t,", toks);
823 if (toks.size() != ncols) {
824 list<string>::const_iterator it = iter;
825 ++it;
826 if (it != m_TestLines.end() || (m_iTestDataSize < s_iTestBufferSize) ) {
827 return false;
828 }
829 } else {
830 ++nlines;
831 }
832 }
833 return ( nlines >= 2 );
834 }
835
836 // -----------------------------------------------------------------------------
837 bool
838 CFormatGuess::TestFormatFasta(
839 EMode /* not used */ )
840 {
841 if ( ! EnsureStats() ) {
842 return false;
843 }
844
845 // reject obvious misfits:
846 if ( m_iTestDataSize == 0 || m_pTestBuffer[0] != '>' ) {
847 return false;
848 }
849 if ( m_iStatsCountData == 0 ) {
850 if (0.75 > double(m_iStatsCountAlNumChars)/double(m_iTestDataSize) ) {
851 return false;
852 }
853 return ( NStr::Find( m_pTestBuffer, "|" ) <= 10 );
854 }
855
856 // remaining decision based on text stats:
857 double dAlNumFraction = (double)m_iStatsCountAlNumChars / m_iTestDataSize;
858 double dDnaFraction = (double)m_iStatsCountDnaChars / m_iStatsCountData;
859 double dAaFraction = (double)m_iStatsCountAaChars / m_iStatsCountData;
860
861 // want at least 80% text-ish overall:
862 if ( dAlNumFraction < 0.8 ) {
863 return false;
864 }
865
866 // want more than 91 percent of either DNA content or AA content in what we
867 // presume is data:
868 if ( dDnaFraction > 0.91 || dAaFraction > 0.91 ) {
869 return true;
870 }
871 return false;
872 }
873
874 // ----------------------------------------------------------------------------
875 bool
876 CFormatGuess::TestFormatTextAsn(
877 EMode /* not used */ )
878 {
879 if ( ! EnsureStats() ) {
880 return false;
881 }
882
883 // reject obvious misfits:
884 if ( m_iTestDataSize == 0 || m_pTestBuffer[0] == '>' ) {
885 return false;
886 }
887
888 // criteria:
889 // at least 80% text-ish,
890 // "::=" as the 2nd field of the first non-blank non comment line.
891 //
892 double dAlNumFraction = (double)m_iStatsCountAlNumChars / m_iTestDataSize;
893 if ( dAlNumFraction < 0.80 ) {
894 return false;
895 }
896
897 CNcbiIstrstream TestBuffer(
898 reinterpret_cast<const char*>( m_pTestBuffer ), m_iTestDataSize );
899 string strLine;
900
901 while ( ! TestBuffer.fail() ) {
902 vector<string> Fields;
903 NcbiGetline( TestBuffer, strLine, "\n\r" );
904 NStr::Tokenize( strLine, " \t", Fields, NStr::eMergeDelims );
905 if ( IsAsnComment( Fields ) ) {
906 continue;
907 }
908 return ( Fields.size() >= 2 && Fields[1] == "::=" );
909 }
910 return false;
911 }
912
913 // -----------------------------------------------------------------------------
914 bool
915 CFormatGuess::TestFormatTaxplot(
916 EMode /* not used */ )
917 {
918 return false;
919 }
920
921 // -----------------------------------------------------------------------------
922 bool
923 CFormatGuess::TestFormatSnpMarkers(
924 EMode /* not used */ )
925 {
926 if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
927 return false;
928 }
929 ITERATE( list<string>, it, m_TestLines ) {
930 string str = *it;
931 int rsid, chr, pos, numMatched;
932 numMatched = sscanf( it->c_str(), "rs%d\t%d\t%d", &rsid, &chr, &pos);
933 if ( numMatched == 3) {
934 return true;
935 }
936 }
937 return false;
938 }
939
940
941 // ----------------------------------------------------------------------------
942 bool
943 CFormatGuess::TestFormatBed(
944 EMode /* not used */ )
945 {
946 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
947 return false;
948 }
949
950 bool bTrackLineFound( false );
951 size_t columncount = 0;
952 ITERATE( list<string>, it, m_TestLines ) {
953 string str = NStr::TruncateSpaces( *it );
954 if ( str.empty() ) {
955 continue;
956 }
957
958 //
959 // while occurrence of the following decorations _is_ a good sign, they could
960 // also be indicator for a variety of other UCSC data formats
961 //
962 if ( NStr::StartsWith( str, "track" ) ) {
963 bTrackLineFound = true;
964 continue;
965 }
966 if ( NStr::StartsWith( str, "browser" ) ) {
967 continue;
968 }
969 if ( NStr::StartsWith( str, "#" ) ) {
970 continue;
971 }
972
973 vector<string> columns;
974 NStr::Tokenize( str, " \t", columns, NStr::eMergeDelims );
975 if (columns.size() < 3 || columns.size() > 12) {
976 return false;
977 }
978 if ( columns.size() != columncount ) {
979 if ( columncount == 0 ) {
980 columncount = columns.size();
981 }
982 else {
983 return false;
984 }
985 }
986 }
987 return bTrackLineFound;
988 }
989
990 // ----------------------------------------------------------------------------
991 bool
992 CFormatGuess::TestFormatBed15(
993 EMode /* not used */ )
994 {
995 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
996 return false;
997 }
998
999 size_t columncount = 15;
1000 ITERATE( list<string>, it, m_TestLines ) {
1001 if ( NStr::TruncateSpaces( *it ).empty() ) {
1002 continue;
1003 }
1004 //
1005 // while occurrence of the following decorations _is_ a good sign, they could
1006 // also be indicator for a variety of other UCSC data formats
1007 //
1008 if ( NStr::StartsWith( *it, "track" ) ) {
1009 continue;
1010 }
1011 if ( NStr::StartsWith( *it, "browser" ) ) {
1012 continue;
1013 }
1014 if ( NStr::StartsWith( *it, "#" ) ) {
1015 continue;
1016 }
1017
1018 vector<string> columns;
1019 NStr::Tokenize( *it, " \t", columns, NStr::eMergeDelims );
1020 if ( columns.size() != columncount ) {
1021 return false;
1022 }
1023 }
1024 return true;
1025 }
1026
1027 // ----------------------------------------------------------------------------
1028 bool
1029 CFormatGuess::TestFormatWiggle(
1030 EMode /* not used */ )
1031 {
1032 if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1033 return false;
1034 }
1035 ITERATE( list<string>, it, m_TestLines ) {
1036 if ( NStr::StartsWith( *it, "track" ) ) {
1037 if ( NStr::Find( *it, "type=wiggle_0" ) != NPOS ) {
1038 return true;
1039 }
1040 if ( NStr::Find( *it, "type=bedGraph" ) != NPOS ) {
1041 return true;
1042 }
1043 }
1044 }
1045 return false;
1046 }
1047
1048
1049 // ----------------------------------------------------------------------------
1050 bool CFormatGuess::IsInputRepeatMaskerWithHeader()
1051 {
1052 //
1053 // Repeatmasker files consist of columnar data with a couple of lines
1054 // of column labels prepended to it (but sometimes someone strips those
1055 // labels).
1056 // This function tries to identify repeatmasker data by those column
1057 // label lines. They should be the first non-blanks in the file.
1058 //
1059 string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
1060 string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
1061
1062 //
1063 // Purge junk lines:
1064 //
1065 list<string>::iterator it = m_TestLines.begin();
1066 for ( ; it != m_TestLines.end(); ++it ) {
1067 NStr::TruncateSpacesInPlace( *it );
1068 if ( *it != "" ) {
1069 break;
1070 }
1071 }
1072
1073 if ( it == m_TestLines.end() ) {
1074 return false;
1075 }
1076
1077 //
1078 // Verify first line of labels:
1079 //
1080 size_t current_offset = 0;
1081 for ( size_t i=0; labels_1st_line[i] != ""; ++i ) {
1082 current_offset = NStr::FindCase( *it, labels_1st_line[i], current_offset );
1083 if ( current_offset == NPOS ) {
1084 return false;
1085 }
1086 }
1087
1088 //
1089 // Verify second line of labels:
1090 //
1091 ++it;
1092 if ( it == m_TestLines.end() ) {
1093 return false;
1094 }
1095 current_offset = 0;
1096 for ( size_t j=0; labels_2nd_line[j] != ""; ++j ) {
1097 current_offset = NStr::FindCase( *it, labels_2nd_line[j], current_offset );
1098 if ( current_offset == NPOS ) {
1099 return false;
1100 }
1101 }
1102
1103 //
1104 // Should have at least one extra line:
1105 //
1106 ++it;
1107 if ( it == m_TestLines.end() ) {
1108 return false;
1109 }
1110
1111 return true;
1112 }
1113
1114
1115 // ----------------------------------------------------------------------------
1116 bool CFormatGuess::IsInputRepeatMaskerWithoutHeader()
1117 {
1118 //
1119 // Repeatmasker files consist of columnar data with a couple of lines
1120 // of column labels prepended to it (but sometimes someone strips those
1121 // labels).
1122 // This function assumes the column labels have been stripped and attempts
1123 // to identify RMO by checking the data itself.
1124 //
1125
1126 //
1127 // We declare the data as RMO if we are able to parse every record in the
1128 // sample we got:
1129 //
1130 ITERATE( list<string>, it, m_TestLines ) {
1131 string str = NStr::TruncateSpaces( *it );
1132 if ( str == "" ) {
1133 continue;
1134 }
1135 if ( ! IsLineRmo( str ) ) {
1136 return false;
1137 }
1138 }
1139
1140 return true;
1141 }
1142
1143
1144 // ----------------------------------------------------------------------------
1145 bool
1146 CFormatGuess::IsLineNewick(
1147 const string& cline )
1148 {
1149 //
1150 // Note:
1151 // Newick lines are a little tricky. They contain tree structure of the form
1152 // (a,b), where each a or be can either be a another tree structure, or a
1153 // label of the form 'ABCD'. The trickiness comes from the fact that these
1154 // beasts are highly recursive, to the point that our 1k read buffer may not
1155 // even cover a single line in the file. Which means, we might only have a
1156 // partial line to work with.
1157 //
1158 // The test:
1159 // Throw away all the labels, i.e. everything between an odd-numbered ' and
1160 // an even numbered tick. After that, there should only remain '(', ')', ';',
1161 // ''', ',', or whitespace.
1162 // Moreover, if there is a semicolon, it must be at the end of the line.
1163 //
1164 string line = NStr::TruncateSpaces( cline );
1165 if ( line.empty() ) {
1166 return false;
1167 }
1168 string delimiters = " ,();";
1169 for ( size_t i=0; line[i] != 0; ++i ) {
1170
1171 if ( NPOS != delimiters.find( line[i] ) ) {
1172 if ( line[i] == ';' && i != line.size() - 1 ) {
1173 return false;
1174 }
1175 else {
1176 continue;
1177 }
1178 }
1179 if ( line[i] == '[' || line[i] == ']' ) {
1180 return false;
1181 }
1182 size_t label_end = line.find_first_of( delimiters, i );
1183 string label = line.substr( i, label_end - i );
1184 if ( ! IsLabelNewick( label ) ) {
1185 return false;
1186 }
1187 if ( NPOS == label_end ) {
1188 return true;
1189 }
1190 i = label_end;
1191 }
1192 return true;
1193 }
1194
1195
1196 // ----------------------------------------------------------------------------
1197 bool CFormatGuess::IsLineFlatFileSequence(
1198 const string& line )
1199 {
1200 // blocks of ten residues (or permitted punctuation characters)
1201 // with a count at the start or end; require at least four
1202 // (normally six)
1203 SIZE_TYPE pos = line.find_first_not_of("0123456789 \t");
1204 if (pos == NPOS || pos + 45 >= line.size()) {
1205 return false;
1206 }
1207
1208 for (SIZE_TYPE i = 0; i < 45; ++i) {
1209 char c = line[pos + i];
1210 if (i % 11 == 10) {
1211 if ( !isspace(c) ) {
1212 return false;
1213 }
1214 } else {
1215 if ( !isalpha(c) && c != '-' && c != '*') {
1216 return false;
1217 }
1218 }
1219 }
1220
1221 return true;
1222 }
1223
1224
1225 // ----------------------------------------------------------------------------
1226 bool CFormatGuess::IsLabelNewick(
1227 const string& label )
1228 {
1229 // Starts with a string of anything other than "[]:", optionally followed by
1230 // a single ':', followed by a number, optionally followed by a dot and
1231 // another number.
1232 if ( NPOS != label.find_first_of( "[]" ) ) {
1233 return false;
1234 }
1235 size_t colon = label.find( ':' );
1236 if ( NPOS == colon ) {
1237 return true;
1238 }
1239 size_t dot = label.find_first_not_of( "0123456789", colon + 1 );
1240 if ( NPOS == dot ) {
1241 return true;
1242 }
1243 if ( label[ dot ] != '.' ) {
1244 return false;
1245 }
1246 size_t end = label.find_first_not_of( "0123456789", dot + 1 );
1247 return ( NPOS == end );
1248 }
1249
1250
1251 // ----------------------------------------------------------------------------
1252 bool CFormatGuess::IsLineAgp(
1253 const string& strLine )
1254 {
1255 //
1256 // Note: The reader allows for line and endline comments starting with a '#'.
1257 // So we accept them here, too.
1258 //
1259 string line( strLine );
1260 size_t uCommentStart = NStr::Find( line, "#" );
1261
1262 if ( NPOS != uCommentStart ) {
1263 line = line.substr( 0, uCommentStart );
1264 }
1265 NStr::TruncateSpacesInPlace( line );
1266 if ( line.empty() ) {
1267 return true;
1268 }
1269
1270 vector<string> tokens;
1271 if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1272 return false;
1273 }
1274
1275 if ( tokens[1].size() > 1 && tokens[1][0] == '-' ) {
1276 tokens[1][0] = '1';
1277 }
1278 if ( -1 == NStr::StringToNumeric( tokens[1] ) ) {
1279 return false;
1280 }
1281
1282 if ( tokens[2].size() > 1 && tokens[2][0] == '-' ) {
1283 tokens[2][0] = '1';
1284 }
1285 if ( -1 == NStr::StringToNumeric( tokens[2] ) ) {
1286 return false;
1287 }
1288
1289 if ( tokens[3].size() > 1 && tokens[3][0] == '-' ) {
1290 tokens[3][0] = '1';
1291 }
1292 if ( -1 == NStr::StringToNumeric( tokens[3] ) ) {
1293 return false;
1294 }
1295
1296 if ( tokens[4].size() != 1 || NPOS == tokens[4].find_first_of( "ADFGPNOW" ) ) {
1297 return false;
1298 }
1299 if ( tokens[4] == "N" ) {
1300 if ( -1 == NStr::StringToNumeric( tokens[5] ) ) {
1301 return false;
1302 }
1303 }
1304 else {
1305 if ( -1 == NStr::StringToNumeric( tokens[6] ) ) {
1306 return false;
1307 }
1308 if ( -1 == NStr::StringToNumeric( tokens[7] ) ) {
1309 return false;
1310 }
1311 if ( tokens.size() != 9 ) {
1312 return false;
1313 }
1314 if ( tokens[8].size() != 1 || NPOS == tokens[8].find_first_of( "+-" ) ) {
1315 return false;
1316 }
1317 }
1318
1319 return true;
1320 }
1321
1322
1323 // ----------------------------------------------------------------------------
1324 bool CFormatGuess::IsLineGlimmer3(
1325 const string& line )
1326 {
1327 list<string> toks;
1328 NStr::Split(line, "\t ", toks);
1329 if (toks.size() != 5) {
1330 return false;
1331 }
1332
1333 list<string>::iterator i = toks.begin();
1334
1335 /// first column: skip (ascii identifier)
1336 ++i;
1337
1338 /// second, third columns: both ints
1339 if ( ! s_IsTokenInteger( *i++ ) ) {
1340 return false;
1341 }
1342 if ( ! s_IsTokenInteger( *i++ ) ) {
1343 return false;
1344 }
1345
1346 /// fourth column: int in the range of -3...3
1347 if ( ! s_IsTokenInteger( *i ) ) {
1348 return false;
1349 }
1350 int frame = NStr::StringToInt( *i++ );
1351 if (frame < -3 || frame > 3) {
1352 return false;
1353 }
1354
1355 /// fifth column: score; double
1356 if ( ! s_IsTokenDouble( *i ) ) {
1357 return false;
1358 }
1359
1360 return true;
1361 }
1362
1363
1364 // ----------------------------------------------------------------------------
1365 bool CFormatGuess::IsLineGtf(
1366 const string& line )
1367 {
1368 vector<string> tokens;
1369 if ( NStr::Tokenize( line, " \t", tokens, NStr::eMergeDelims ).size() < 8 ) {
1370 return false;
1371 }
1372 if ( ! s_IsTokenPosInt( tokens[3] ) ) {
1373 return false;
1374 }
1375 if ( ! s_IsTokenPosInt( tokens[4] ) ) {
1376 return false;
1377 }
1378 if ( ! s_IsTokenDouble( tokens[5] ) ) {
1379 return false;
1380 }
1381 if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
1382 return false;
1383 }
1384 if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
1385 return false;
1386 }
1387 return true;
1388 }
1389
1390
1391 // ----------------------------------------------------------------------------
1392 bool CFormatGuess::IsLinePhrapId(
1393 const string& line )
1394 {
1395 vector<string> values;
1396 if ( NStr::Tokenize( line, " \t", values, NStr::eMergeDelims ).empty() ) {
1397 return false;
1398 }
1399
1400 //
1401 // Old style: "^DNA \\w+ "
1402 //
1403 if ( values[0] == "DNA" ) {
1404 return true;
1405 }
1406
1407 //
1408 // New style: "^AS [0-9]+ [0-9]+"
1409 //
1410 if ( values[0] == "AS" ) {
1411 return ( 0 <= NStr::StringToNumeric( values[1] ) &&
1412 0 <= NStr::StringToNumeric( values[2] ) );
1413 }
1414
1415 return false;
1416 }
1417
1418
1419 // ----------------------------------------------------------------------------
1420 bool CFormatGuess::IsLineRmo(
1421 const string& line )
1422 {
1423 const size_t MIN_VALUES_PER_RECORD = 15;
1424
1425 //
1426 // Make sure there is enough stuff on that line:
1427 //
1428 list<string> values;
1429 if ( NStr::Split( line, " \t", values ).size() < MIN_VALUES_PER_RECORD ) {
1430 return false;
1431 }
1432
1433 //
1434 // Look at specific values and make sure they are of the correct type:
1435 //
1436
1437 // 1: positive integer:
1438 list<string>::iterator it = values.begin();
1439 if ( ! s_IsTokenPosInt( *it ) ) {
1440 return false;
1441 }
1442
1443 // 2: float:
1444 ++it;
1445 if ( ! s_IsTokenDouble( *it ) ) {
1446 return false;
1447 }
1448
1449 // 3: float:
1450 ++it;
1451 if ( ! s_IsTokenDouble( *it ) ) {
1452 return false;
1453 }
1454
1455 // 4: float:
1456 ++it;
1457 if ( ! s_IsTokenDouble( *it ) ) {
1458 return false;
1459 }
1460
1461 // 5: string, not checked
1462 ++it;
1463
1464 // 6: positive integer:
1465 ++it;
1466 if ( ! s_IsTokenPosInt( *it ) ) {
1467 return false;
1468 }
1469
1470 // 7: positive integer:
1471 ++it;
1472 if ( ! s_IsTokenPosInt( *it ) ) {
1473 return false;
1474 }
1475
1476 // 8: positive integer, likely in paretheses, not checked:
1477 ++it;
1478
1479 // 9: '+' or 'C':
1480 ++it;
1481 if ( *it != "+" && *it != "C" ) {
1482 return false;
1483 }
1484
1485 // and that's enough for now. But there are at least two more fields
1486 // with values that look testable.
1487
1488 return true;
1489 }
1490
1491
1492 // ----------------------------------------------------------------------------
1493 bool
1494 CFormatGuess::IsAsnComment(
1495 const vector<string>& Fields )
1496 {
1497 if ( Fields.size() == 0 ) {
1498 return true;
1499 }
1500 return ( NStr::StartsWith( Fields[0], "--" ) );
1501 }
1502
1503 // ----------------------------------------------------------------------------
1504 bool
1505 CFormatGuess::EnsureSplitLines()
1506 // ----------------------------------------------------------------------------
1507 {
1508 if ( m_bSplitDone ) {
1509 return !m_TestLines.empty();
1510 }
1511 m_bSplitDone = true;
1512
1513 //
1514 // Make sure the given data is ASCII before checking potential line breaks:
1515 //
1516 const size_t MIN_HIGH_RATIO = 20;
1517 size_t high_count = 0;
1518 for ( streamsize i=0; i < m_iTestDataSize; ++i ) {
1519 if ( 0x80 & m_pTestBuffer[i] ) {
1520 ++high_count;
1521 }
1522 }
1523 if ( 0 < high_count && m_iTestDataSize / high_count < MIN_HIGH_RATIO ) {
1524 return false;
1525 }
1526
1527 //
1528 // Let's expect at least one line break in the given data:
1529 //
1530 string data( m_pTestBuffer, m_iTestDataSize );
1531
1532 m_TestLines.clear();
1533 if ( NStr::Split( data, "\r\n", m_TestLines ).size() <= 1 ) {
1534 m_TestLines.clear();
1535 if ( NStr::Split( data, "\r", m_TestLines ).size() <= 1 ) {
1536 m_TestLines.clear();
1537 NStr::Split( data, "\n", m_TestLines );
1538 }
1539 }
1540 if ( (m_TestLines.size() <= 1) && (m_iTestDataSize == s_iTestBufferSize) ) {
1541 //
1542 // single truncated line...
1543 //
1544 m_TestLines.clear();
1545 return false;
1546 }
1547
1548 if ( m_iTestDataSize == s_iTestBufferSize ) {
1549 m_TestLines.pop_back();
1550 }
1551 return !m_TestLines.empty();
1552 }
1553
1554 END_NCBI_SCOPE
1555 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |