|
NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/src/util/line_reader.cpp |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: line_reader.cpp 162426 2009-06-05 15:13:09Z ucko $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko, Anatoliy Kuznetsov
27 *
28 * File Description:
29 * Lightweight interface for getting lines of data with minimal
30 * memory copying.
31 *
32 * ===========================================================================
33 */
34
35 #include <ncbi_pch.hpp>
36 #include <util/line_reader.hpp>
37 #include <util/util_exception.hpp>
38 #include <corelib/rwstream.hpp>
39 #include <corelib/ncbifile.hpp>
40 #include <corelib/stream_utils.hpp>
41
42 #include <string.h>
43
44 BEGIN_NCBI_SCOPE
45
46
47 CRef<ILineReader> ILineReader::New(const string& filename)
48 {
49 CRef<ILineReader> lr;
50 lr.Reset(new CBufferedLineReader(filename));
51 return lr;
52 }
53
54
55 CStreamLineReader::CStreamLineReader(CNcbiIstream& is,
56 EEOLStyle eol_style,
57 EOwnership ownership)
58 : m_Stream(&is, ownership), m_LineNumber(0), m_UngetLine(false),
59 m_AutoEOL(eol_style == eEOL_unknown), m_EOLStyle(eol_style)
60 {
61 }
62
63
64 CStreamLineReader::CStreamLineReader(CNcbiIstream& is,
65 EOwnership ownership)
66 : m_Stream(&is, ownership), m_LineNumber(0), m_UngetLine(false),
67 m_AutoEOL(true), m_EOLStyle(eEOL_unknown)
68 {
69 }
70
71
72 CStreamLineReader::~CStreamLineReader()
73 {
74 }
75
76
77 bool CStreamLineReader::AtEOF(void) const
78 {
79 return !m_UngetLine &&
80 (m_Stream->eof() || CT_EQ_INT_TYPE(m_Stream->peek(), CT_EOF));
81 }
82
83
84 char CStreamLineReader::PeekChar(void) const
85 {
86 return m_UngetLine? *m_Line.begin(): m_Stream->peek();
87 }
88
89
90 void CStreamLineReader::UngetLine(void)
91 {
92 _ASSERT(!m_UngetLine);
93 --m_LineNumber;
94 m_UngetLine = true;
95 }
96
97
98 CStreamLineReader& CStreamLineReader::operator++(void)
99 {
100 ++m_LineNumber;
101 if ( m_UngetLine ) {
102 m_UngetLine = false;
103 return *this;
104 }
105
106 switch (m_EOLStyle) {
107 case eEOL_unknown: x_AdvanceEOLUnknown(); break;
108 case eEOL_cr: x_AdvanceEOLSimple('\r', '\n'); break;
109 case eEOL_lf: x_AdvanceEOLSimple('\n', '\r'); break;
110 case eEOL_crlf: x_AdvanceEOLCRLF(); break;
111 case eEOL_mixed: NcbiGetline(*m_Stream, m_Line, "\r\n"); break;
112 }
113 return *this;
114 }
115
116
117 CTempString CStreamLineReader::operator*(void) const
118 {
119 _ASSERT(!m_UngetLine);
120 return CTempString(m_Line);
121 }
122
123
124 CT_POS_TYPE CStreamLineReader::GetPosition(void) const
125 {
126 return m_Stream->tellg();
127 }
128
129
130 unsigned int CStreamLineReader::GetLineNumber(void) const
131 {
132 return m_LineNumber;
133 }
134
135
136 CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLUnknown(void)
137 {
138 _ASSERT(m_AutoEOL);
139 NcbiGetline(*m_Stream, m_Line, "\r\n");
140 m_Stream->unget();
141 CT_INT_TYPE eol = m_Stream->get();
142 if (CT_EQ_INT_TYPE(eol, CT_TO_INT_TYPE('\r'))) {
143 m_EOLStyle = eEOL_cr;
144 } else if (CT_EQ_INT_TYPE(eol, CT_TO_INT_TYPE('\n'))) {
145 // NcbiGetline doesn't yield enough information to determine
146 // whether eEOL_lf or eEOL_crlf is more appropriate, and not
147 // all streams allow tellg() (which could otherwise resolve
148 // matters), so defer further analysis to x_AdvanceEOLCRLF,
149 // which will be responsible for reading the next line and
150 // supports switching to eEOL_lf as appropriate.
151 //
152 // An alternative approach would have been to pass \r\n rather
153 // than \n\r, and then check for an immediately following \n
154 // if eol turned out to be \r, but that would miscount an
155 // actual(!) \r\n sequence as a single line break.
156 m_EOLStyle = eEOL_crlf;
157 }
158 return m_EOLStyle;
159 }
160
161
162 CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLSimple(char eol,
163 char alt_eol)
164 {
165 SIZE_TYPE pos;
166 NcbiGetline(*m_Stream, m_Line, eol);
167 if (m_AutoEOL && (pos = m_Line.find(alt_eol)) != NPOS) {
168 ++pos;
169 if (eol != '\n' || pos != m_Line.size()) {
170 // an *immediately* preceding CR is quite all right
171 CStreamUtils::Pushback(*m_Stream, m_Line.data() + pos,
172 m_Line.size() - pos);
173 m_EOLStyle = eEOL_mixed;
174 }
175 m_Line.resize(pos - 1);
176 return (m_EOLStyle == eEOL_mixed) ? m_EOLStyle : eEOL_crlf;
177 } else if (m_AutoEOL && eol == '\r' &&
178 CT_EQ_INT_TYPE(m_Stream->peek(), CT_TO_INT_TYPE(alt_eol))) {
179 m_Stream->get();
180 return eEOL_crlf;
181 }
182 return (eol == '\r') ? eEOL_cr : eEOL_lf;
183 }
184
185
186 CStreamLineReader::EEOLStyle CStreamLineReader::x_AdvanceEOLCRLF(void)
187 {
188 if (m_AutoEOL) {
189 EEOLStyle style = x_AdvanceEOLSimple('\n', '\r');
190 if (style == eEOL_mixed) {
191 // found an embedded CR
192 m_EOLStyle = eEOL_cr;
193 } else if (style != eEOL_crlf) {
194 m_EOLStyle = eEOL_lf;
195 }
196 } else {
197 string extra;
198 NcbiGetline(*m_Stream, m_Line, '\n');
199 while ( !AtEOF() && !NStr::EndsWith(m_Line, "\r") ) {
200 m_Line += '\n';
201 NcbiGetline(*m_Stream, extra, '\n');
202 m_Line += extra;
203 }
204 if (NStr::EndsWith(m_Line, "\r")) {
205 m_Line.resize(m_Line.size() - 1);
206 }
207 }
208 return m_EOLStyle;
209 }
210
211
212 CMemoryLineReader::CMemoryLineReader(CMemoryFile* mem_file,
213 EOwnership ownership)
214 : m_Start(static_cast<char*>(mem_file->GetPtr())),
215 m_End(m_Start + mem_file->GetSize()),
216 m_Pos(m_Start),
217 m_MemFile(mem_file, ownership),
218 m_LineNumber(0)
219 {
220 m_MemFile->MemMapAdvise(CMemoryFile::eMMA_Sequential);
221 }
222
223
224 bool CMemoryLineReader::AtEOF(void) const
225 {
226 return m_Pos >= m_End;
227 }
228
229
230 char CMemoryLineReader::PeekChar(void) const
231 {
232 return *m_Pos;
233 }
234
235
236 void CMemoryLineReader::UngetLine(void)
237 {
238 _ASSERT(m_Line.begin());
239 _ASSERT(m_Pos != m_Line.begin());
240 --m_LineNumber;
241 m_Pos = m_Line.begin();
242 }
243
244
245 CMemoryLineReader& CMemoryLineReader::operator++(void)
246 {
247 const char* p = m_Pos;
248 if ( p == m_Line.begin() ) {
249 p = m_Line.end();
250 }
251 else {
252 while ( p < m_End && *p != '\r' && *p != '\n' ) {
253 ++p;
254 }
255 m_Line = CTempString(m_Pos, p - m_Pos);
256 }
257 // skip over delimiters
258 if (p + 1 < m_End && *p == '\r' && p[1] == '\n') {
259 m_Pos = p + 2;
260 } else if (p < m_End) {
261 m_Pos = p + 1;
262 } else { // no final line break
263 m_Pos = p;
264 }
265 ++m_LineNumber;
266 return *this;
267 }
268
269
270 CTempString CMemoryLineReader::operator*(void) const
271 {
272 _ASSERT(m_Line.begin());
273 return m_Line;
274 }
275
276
277 CT_POS_TYPE CMemoryLineReader::GetPosition(void) const
278 {
279 return NcbiInt8ToStreampos(m_Pos - m_Start);
280 }
281
282
283 unsigned int CMemoryLineReader::GetLineNumber(void) const
284 {
285 return m_LineNumber;
286 }
287
288
289 CBufferedLineReader::CBufferedLineReader(IReader* reader,
290 EOwnership ownership)
291 : m_Reader(reader, ownership),
292 m_Eof(false),
293 m_BufferSize(32*1024),
294 m_Buffer(new char[m_BufferSize]),
295 m_Pos(m_Buffer.get()),
296 m_End(m_Pos),
297 m_InputPos(0),
298 m_LineNumber(0)
299 {
300 x_ReadBuffer();
301 }
302
303
304 CBufferedLineReader::CBufferedLineReader(CNcbiIstream& is,
305 EOwnership ownership)
306 : m_Reader(new CStreamReader(is, ownership)),
307 m_Eof(false),
308 m_BufferSize(32*1024),
309 m_Buffer(new char[m_BufferSize]),
310 m_Pos(m_Buffer.get()),
311 m_End(m_Pos),
312 m_InputPos(0),
313 m_LineNumber(0)
314 {
315 x_ReadBuffer();
316 }
317
318
319 CBufferedLineReader::CBufferedLineReader(const string& filename)
320 : m_Reader(CFileReader::New(filename)),
321 m_Eof(false),
322 m_UngetLine(false),
323 m_BufferSize(32*1024),
324 m_Buffer(new char[m_BufferSize]),
325 m_Pos(m_Buffer.get()),
326 m_End(m_Pos),
327 m_InputPos(0),
328 m_LineNumber(0)
329 {
330 x_ReadBuffer();
331 }
332
333
334 CBufferedLineReader::~CBufferedLineReader()
335 {
336 }
337
338
339 bool CBufferedLineReader::AtEOF(void) const
340 {
341 return m_Eof && !m_UngetLine;
342 }
343
344
345 char CBufferedLineReader::PeekChar(void) const
346 {
347 return m_UngetLine? *m_Line.begin(): *m_Pos;
348 }
349
350
351 void CBufferedLineReader::UngetLine(void)
352 {
353 _ASSERT(!m_UngetLine);
354 _ASSERT(m_Line.begin());
355 --m_LineNumber;
356 m_UngetLine = true;
357 }
358
359
360 CBufferedLineReader& CBufferedLineReader::operator++(void)
361 {
362 ++m_LineNumber;
363 if ( m_UngetLine ) {
364 _ASSERT(m_Line.begin());
365 m_UngetLine = false;
366 return *this;
367 }
368 // check if we are at the buffer end
369 const char* start = m_Pos;
370 const char* end = m_End;
371 for ( const char* p = start; p < end; ++p ) {
372 if ( *p == '\n' ) {
373 m_Line = CTempString(start, p - start);
374 m_Pos = ++p;
375 if ( p == end ) {
376 m_String = m_Line;
377 m_Line = m_String;
378 x_ReadBuffer();
379 }
380 return *this;
381 }
382 else if ( *p == '\r' ) {
383 m_Line = CTempString(start, p - start);
384 if ( ++p == end ) {
385 m_String = m_Line;
386 m_Line = m_String;
387 if ( x_ReadBuffer() ) {
388 p = m_Pos;
389 if ( *p == '\n' ) {
390 m_Pos = p+1;
391 }
392 }
393 return *this;
394 }
395 if ( *p != '\n' ) {
396 return *this;
397 }
398 m_Pos = ++p;
399 if ( p == end ) {
400 m_String = m_Line;
401 m_Line = m_String;
402 x_ReadBuffer();
403 }
404 return *this;
405 }
406 }
407 x_LoadLong();
408 return *this;
409 }
410
411
412 void CBufferedLineReader::x_LoadLong(void)
413 {
414 const char* start = m_Pos;
415 const char* end = m_End;
416 m_String.assign(start, end);
417 while ( x_ReadBuffer() ) {
418 start = m_Pos;
419 end = m_End;
420 for ( const char* p = start; p < end; ++p ) {
421 char c = *p;
422 if ( c == '\r' || c == '\n' ) {
423 m_String.append(start, p - start);
424 m_Line = m_String;
425 if ( ++p == end ) {
426 m_String = m_Line;
427 m_Line = m_String;
428 if ( x_ReadBuffer() ) {
429 p = m_Pos;
430 end = m_End;
431 if ( p < end && c == '\r' && *p == '\n' ) {
432 ++p;
433 m_Pos = p;
434 }
435 }
436 }
437 else {
438 if ( c == '\r' && *p == '\n' ) {
439 if ( ++p == end ) {
440 x_ReadBuffer();
441 p = m_Pos;
442 }
443 }
444 m_Pos = p;
445 }
446 return;
447 }
448 }
449 m_String.append(start, end - start);
450 }
451 m_Line = m_String;
452 return;
453 }
454
455
456 bool CBufferedLineReader::x_ReadBuffer()
457 {
458 _ASSERT(m_Reader);
459
460 if ( m_Eof ) {
461 return false;
462 }
463
464 m_InputPos += CT_OFF_TYPE(m_End - m_Buffer.get());
465 m_Pos = m_End = m_Buffer.get();
466 for (bool flag = true; flag; ) {
467 size_t size;
468 ERW_Result result =
469 m_Reader->Read(m_Buffer.get(), m_BufferSize, &size);
470 switch (result) {
471 case eRW_NotImplemented:
472 case eRW_Error:
473 NCBI_THROW(CIOException, eRead, "Read error");
474 /*NOTREACHED*/
475 break;
476 case eRW_Timeout:
477 // keep spinning around
478 break;
479 case eRW_Eof:
480 m_Eof = true;
481 // fall through
482 case eRW_Success:
483 m_End = m_Pos + size;
484 return (result == eRW_Success || size > 0);
485 default:
486 _ASSERT(0);
487 }
488 } // for
489 return false;
490 }
491
492
493 CTempString CBufferedLineReader::operator*(void) const
494 {
495 return m_Line;
496 }
497
498
499 CT_POS_TYPE CBufferedLineReader::GetPosition(void) const
500 {
501 return m_InputPos + CT_OFF_TYPE(m_Pos - m_Buffer.get());
502 }
503
504
505 unsigned int CBufferedLineReader::GetLineNumber(void) const
506 {
507 return m_LineNumber;
508 }
509
510
511 END_NCBI_SCOPE
512 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |