NCBI C++ Toolkit Cross Reference

C++/src/util/line_reader.cpp


  1 /*  $Id: line_reader.cpp 162426 2009-06-05 15:13:09Z ucko $
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *               National Center for Biotechnology Information
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government have not placed any restriction on its use or reproduction.
 13 *
 14 *  Although all reasonable efforts have been taken to ensure the accuracy
 15 *  and reliability of the software and data, the NLM and the U.S.
 16 *  Government do not and cannot warrant the performance or results that
 17 *  may be obtained by using this software or data. The NLM and the U.S.
 18 *  Government disclaim all warranties, express or implied, including
 19 *  warranties of performance, merchantability or fitness for any particular
 20 *  purpose.
 21 *
 22 *  Please cite the author in any work or product based on this material.
 23 *
 24 * ===========================================================================
 25 *
 26 * Author:  Aaron Ucko, Anatoliy Kuznetsov
 27 *
 28 * File Description:
 29 *   Lightweight interface for getting lines of data with minimal
 30 *   memory copying.
 31 *
 32 * ===========================================================================
 33 */
 34 
 35 #include <ncbi_pch.hpp>
 36 #include <util/line_reader.hpp>
 37 #include <util/util_exception.hpp>
 38 #include <corelib/rwstream.hpp>
 39 #include <corelib/ncbifile.hpp>
 40 #include <corelib/stream_utils.hpp>
 41 
 42 #include <string.h>
 43 
 44 BEGIN_NCBI_SCOPE
 45 
 46 
 47 CRef<ILineReader> ILineReader::New(const string& filename)
 48 {
 49     CRef<ILineReader> lr;
 50     lr.Reset(new CBufferedLineReader(filename));
 51     return lr;
 52 }
 53 
 54 
 55 CStreamLineReader::CStreamLineReader(CNcbiIstream& is,
 56                                      EEOLStyle eol_style,
 57                                      EOwnership ownership)
 58     : m_Stream(&is, ownership), m_LineNumber(0), m_UngetLine(false),
 59       m_AutoEOL(eol_style == eEOL_unknown), m_EOLStyle(eol_style)
 60 {
 61 }
 62 
 63 
 64 CStreamLineReader::CStreamLineReader(CNcbiIstream& is,
 65                                      EOwnership ownership)
 66     : m_Stream(&is, ownership), m_LineNumber(0), m_UngetLine(false),
 67       m_AutoEOL(true), m_EOLStyle(eEOL_unknown)
 68 {
 69 }
 70 
 71 
 72 CStreamLineReader::~CStreamLineReader()
 73 {
 74 }
 75 
 76 
 77 bool CStreamLineReader::AtEOF(void) const
 78 {
 79     return !m_UngetLine &&
 80         (m_Stream->eof()  ||  CT_EQ_INT_TYPE(m_Stream->peek(), CT_EOF));
 81 }
 82 
 83 
 84 char CStreamLineReader::PeekChar(void) const
 85 {
 86     return m_UngetLine? *m_Line.begin(): m_Stream->peek();
 87 }
 88 
 89 
 90 void CStreamLineReader::UngetLine(void)
 91 {
 92     _ASSERT(!m_UngetLine);
 93     --m_LineNumber;
 94     m_UngetLine = true;
 95 }
 96 
 97 
 98 CStreamLineReader& CStreamLineReader::operator++(void)
 99 {
100     ++m_LineNumber;
101     if ( m_UngetLine ) {
102         m_UngetLine = false;
103         return *this;
104     }
105 
106     switch (m_EOLStyle) {
107     case eEOL_unknown: x_AdvanceEOLUnknown();                   break;
108     case eEOL_cr:      x_AdvanceEOLSimple('\r', '\n');          break;
109     case eEOL_lf:      x_AdvanceEOLSimple('\n', '\r');          break;
110     case eEOL_crlf:    x_AdvanceEOLCRLF();                      break;
111     case eEOL_mixed:   NcbiGetline(*m_Stream, m_Line, "\r\n");  break;
112     }
113     return *this;
114 }
115 
116 
117 CTempString CStreamLineReader::operator*(void) const
118 {
119     _ASSERT(!m_UngetLine);
120     return CTempString(m_Line);
121 }
122 
123 
124 CT_POS_TYPE CStreamLineReader::GetPosition(void) const
125 {
126     return m_Stream->tellg();
127 }
128 
129 
130 unsigned int CStreamLineReader::GetLineNumber(void) const
131 {
132     return m_LineNumber;
133 }
134 
135 
136 CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLUnknown(void)
137 {
138     _ASSERT(m_AutoEOL);
139     NcbiGetline(*m_Stream, m_Line, "\r\n");
140     m_Stream->unget();
141     CT_INT_TYPE eol = m_Stream->get();
142     if (CT_EQ_INT_TYPE(eol, CT_TO_INT_TYPE('\r'))) {
143         m_EOLStyle = eEOL_cr;
144     } else if (CT_EQ_INT_TYPE(eol, CT_TO_INT_TYPE('\n'))) {
145         // NcbiGetline doesn't yield enough information to determine
146         // whether eEOL_lf or eEOL_crlf is more appropriate, and not
147         // all streams allow tellg() (which could otherwise resolve
148         // matters), so defer further analysis to x_AdvanceEOLCRLF,
149         // which will be responsible for reading the next line and
150         // supports switching to eEOL_lf as appropriate.
151         //
152         // An alternative approach would have been to pass \r\n rather
153         // than \n\r, and then check for an immediately following \n
154         // if eol turned out to be \r, but that would miscount an
155         // actual(!) \r\n sequence as a single line break.
156         m_EOLStyle = eEOL_crlf;
157     }
158     return m_EOLStyle;
159 }
160 
161 
162 CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLSimple(char eol,
163                                                                    char alt_eol)
164 {
165     SIZE_TYPE pos;
166     NcbiGetline(*m_Stream, m_Line, eol);
167     if (m_AutoEOL  &&  (pos = m_Line.find(alt_eol)) != NPOS) {
168         ++pos;
169         if (eol != '\n'  ||  pos != m_Line.size()) {
170             // an *immediately* preceding CR is quite all right
171             CStreamUtils::Pushback(*m_Stream, m_Line.data() + pos,
172                                    m_Line.size() - pos);
173             m_EOLStyle = eEOL_mixed;
174         }
175         m_Line.resize(pos - 1);
176         return (m_EOLStyle == eEOL_mixed) ? m_EOLStyle : eEOL_crlf;
177     } else if (m_AutoEOL  &&  eol == '\r'  &&
178                CT_EQ_INT_TYPE(m_Stream->peek(), CT_TO_INT_TYPE(alt_eol))) {
179         m_Stream->get();
180         return eEOL_crlf;
181     }
182     return (eol == '\r') ? eEOL_cr : eEOL_lf;
183 }
184 
185 
186 CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLCRLF(void)
187 {
188     if (m_AutoEOL) {
189         EEOLStyle style = x_AdvanceEOLSimple('\n', '\r');
190         if (style == eEOL_mixed) {
191             // found an embedded CR
192             m_EOLStyle = eEOL_cr;
193         } else if (style != eEOL_crlf) {
194             m_EOLStyle = eEOL_lf;
195         }
196     } else {
197         string extra;
198         NcbiGetline(*m_Stream, m_Line, '\n');
199         while ( !AtEOF()  &&  !NStr::EndsWith(m_Line, "\r") ) {
200             m_Line += '\n';
201             NcbiGetline(*m_Stream, extra, '\n');
202             m_Line += extra;
203         }
204         if (NStr::EndsWith(m_Line, "\r")) {
205             m_Line.resize(m_Line.size() - 1);
206         }
207     }
208     return m_EOLStyle;
209 }
210 
211 
212 CMemoryLineReader::CMemoryLineReader(CMemoryFile* mem_file,
213                                      EOwnership ownership)
214     : m_Start(static_cast<char*>(mem_file->GetPtr())),
215       m_End(m_Start + mem_file->GetSize()),
216       m_Pos(m_Start),
217       m_MemFile(mem_file, ownership),
218       m_LineNumber(0)
219 {
220     m_MemFile->MemMapAdvise(CMemoryFile::eMMA_Sequential);
221 }
222 
223 
224 bool CMemoryLineReader::AtEOF(void) const
225 {
226     return m_Pos >= m_End;
227 }
228 
229 
230 char CMemoryLineReader::PeekChar(void) const
231 {
232     return *m_Pos;
233 }
234 
235 
236 void CMemoryLineReader::UngetLine(void)
237 {
238     _ASSERT(m_Line.begin());
239     _ASSERT(m_Pos != m_Line.begin());
240     --m_LineNumber;
241     m_Pos = m_Line.begin();
242 }
243 
244 
245 CMemoryLineReader& CMemoryLineReader::operator++(void)
246 {
247     const char* p = m_Pos;
248     if ( p == m_Line.begin() ) {
249         p = m_Line.end();
250     }
251     else {
252         while ( p < m_End  &&  *p != '\r'  && *p != '\n' ) {
253             ++p;
254         }
255         m_Line = CTempString(m_Pos, p - m_Pos);
256     }
257     // skip over delimiters
258     if (p + 1 < m_End  &&  *p == '\r'  &&  p[1] == '\n') {
259         m_Pos = p + 2;
260     } else if (p < m_End) {
261         m_Pos = p + 1;
262     } else { // no final line break
263         m_Pos = p;
264     }
265     ++m_LineNumber;
266     return *this;
267 }
268 
269 
270 CTempString CMemoryLineReader::operator*(void) const
271 {
272     _ASSERT(m_Line.begin());
273     return m_Line;
274 }
275 
276 
277 CT_POS_TYPE CMemoryLineReader::GetPosition(void) const
278 {
279     return NcbiInt8ToStreampos(m_Pos - m_Start);
280 }
281 
282 
283 unsigned int CMemoryLineReader::GetLineNumber(void) const
284 {
285     return m_LineNumber;
286 }
287 
288 
289 CBufferedLineReader::CBufferedLineReader(IReader* reader,
290                                          EOwnership ownership)
291     : m_Reader(reader, ownership),
292       m_Eof(false),
293       m_BufferSize(32*1024),
294       m_Buffer(new char[m_BufferSize]),
295       m_Pos(m_Buffer.get()),
296       m_End(m_Pos),
297       m_InputPos(0),
298       m_LineNumber(0)
299 {
300     x_ReadBuffer();
301 }
302 
303 
304 CBufferedLineReader::CBufferedLineReader(CNcbiIstream& is,
305                                          EOwnership ownership)
306     : m_Reader(new CStreamReader(is, ownership)),
307       m_Eof(false),
308       m_BufferSize(32*1024),
309       m_Buffer(new char[m_BufferSize]),
310       m_Pos(m_Buffer.get()),
311       m_End(m_Pos),
312       m_InputPos(0),
313       m_LineNumber(0)
314 {
315     x_ReadBuffer();
316 }
317 
318 
319 CBufferedLineReader::CBufferedLineReader(const string& filename)
320     : m_Reader(CFileReader::New(filename)),
321       m_Eof(false),
322       m_UngetLine(false),
323       m_BufferSize(32*1024),
324       m_Buffer(new char[m_BufferSize]),
325       m_Pos(m_Buffer.get()),
326       m_End(m_Pos),
327       m_InputPos(0),
328       m_LineNumber(0)
329 {
330     x_ReadBuffer();
331 }
332 
333 
334 CBufferedLineReader::~CBufferedLineReader()
335 {
336 }
337 
338 
339 bool CBufferedLineReader::AtEOF(void) const
340 {
341     return m_Eof && !m_UngetLine;
342 }
343 
344 
345 char CBufferedLineReader::PeekChar(void) const
346 {
347     return m_UngetLine? *m_Line.begin(): *m_Pos;
348 }
349 
350 
351 void CBufferedLineReader::UngetLine(void)
352 {
353     _ASSERT(!m_UngetLine);
354     _ASSERT(m_Line.begin());
355     --m_LineNumber;
356     m_UngetLine = true;
357 }
358 
359 
360 CBufferedLineReader& CBufferedLineReader::operator++(void)
361 {
362     ++m_LineNumber;
363     if ( m_UngetLine ) {
364         _ASSERT(m_Line.begin());
365         m_UngetLine = false;
366         return *this;
367     }
368     // check if we are at the buffer end
369     const char* start = m_Pos;
370     const char* end = m_End;
371     for ( const char* p = start; p < end; ++p ) {
372         if ( *p == '\n' ) {
373             m_Line = CTempString(start, p - start);
374             m_Pos = ++p;
375             if ( p == end ) {
376                 m_String = m_Line;
377                 m_Line = m_String;
378                 x_ReadBuffer();
379             }
380             return *this;
381         }
382         else if ( *p == '\r' ) {
383             m_Line = CTempString(start, p - start);
384             if ( ++p == end ) {
385                 m_String = m_Line;
386                 m_Line = m_String;
387                 if ( x_ReadBuffer() ) {
388                     p = m_Pos;
389                     if ( *p == '\n' ) {
390                         m_Pos = p+1;
391                     }
392                 }
393                 return *this;
394             }
395             if ( *p != '\n' ) {
396                 return *this;
397             }
398             m_Pos = ++p;
399             if ( p == end ) {
400                 m_String = m_Line;
401                 m_Line = m_String;
402                 x_ReadBuffer();
403             }
404             return *this;
405         }
406     }
407     x_LoadLong();
408     return *this;
409 }
410 
411 
412 void CBufferedLineReader::x_LoadLong(void)
413 {
414     const char* start = m_Pos;
415     const char* end = m_End;
416     m_String.assign(start, end);
417     while ( x_ReadBuffer() ) {
418         start = m_Pos;
419         end = m_End;
420         for ( const char* p = start; p < end; ++p ) {
421             char c = *p;
422             if ( c == '\r' || c == '\n' ) {
423                 m_String.append(start, p - start);
424                 m_Line = m_String;
425                 if ( ++p == end ) {
426                     m_String = m_Line;
427                     m_Line = m_String;
428                     if ( x_ReadBuffer() ) {
429                         p = m_Pos;
430                         end = m_End;
431                         if ( p < end && c == '\r' && *p == '\n' ) {
432                             ++p;
433                             m_Pos = p;
434                         }
435                     }
436                 }
437                 else {
438                     if ( c == '\r' && *p == '\n' ) {
439                         if ( ++p == end ) {
440                             x_ReadBuffer();
441                             p = m_Pos;
442                         }
443                     }
444                     m_Pos = p;
445                 }
446                 return;
447             }
448         }
449         m_String.append(start, end - start);
450     }
451     m_Line = m_String;
452     return;
453 }
454 
455 
456 bool CBufferedLineReader::x_ReadBuffer()
457 {
458     _ASSERT(m_Reader);
459 
460     if ( m_Eof ) {
461         return false;
462     }
463 
464     m_InputPos += CT_OFF_TYPE(m_End - m_Buffer.get());
465     m_Pos = m_End = m_Buffer.get();
466     for (bool flag = true; flag; ) {
467         size_t size;
468         ERW_Result result =
469             m_Reader->Read(m_Buffer.get(), m_BufferSize, &size);
470         switch (result) {
471         case eRW_NotImplemented:
472         case eRW_Error:
473             NCBI_THROW(CIOException, eRead, "Read error");
474             /*NOTREACHED*/
475             break;
476         case eRW_Timeout:
477             // keep spinning around
478             break;
479         case eRW_Eof:
480             m_Eof = true;
481             // fall through
482         case eRW_Success:
483             m_End = m_Pos + size;
484             return (result == eRW_Success  ||  size > 0);
485         default:
486             _ASSERT(0);
487         }
488     } // for
489     return false;
490 }
491 
492 
493 CTempString CBufferedLineReader::operator*(void) const
494 {
495     return m_Line;
496 }
497 
498 
499 CT_POS_TYPE CBufferedLineReader::GetPosition(void) const
500 {
501     return m_InputPos + CT_OFF_TYPE(m_Pos - m_Buffer.get());
502 }
503 
504 
505 unsigned int CBufferedLineReader::GetLineNumber(void) const
506 {
507     return m_LineNumber;
508 }
509 
510 
511 END_NCBI_SCOPE
512 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.