NCBI C++ ToolKit
ncbistr.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: ncbistr.cpp 77906 2017-05-15 13:28:41Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko, Denis Vakatov
27  *
28  * File Description:
29  * Some helper functions
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/tempstr.hpp>
37 #include <corelib/ncbistr_util.hpp>
38 #include <corelib/error_codes.hpp>
39 #include <corelib/ncbierror.hpp>
40 #include <corelib/ncbifloat.h>
41 #include <memory>
42 #include <functional>
43 #include <algorithm>
44 #include <iterator>
45 #include <stdio.h>
46 #include <locale.h>
47 #include <math.h>
48 
49 
50 #define NCBI_USE_ERRCODE_X Corelib_Util
51 
52 
54 
55 
56 // Digit characters for radixes up to 36: kDigit[v] is the symbol for value v
// ('0'..'9' for 0..9, then 'A'..'Z' for 10..35).
57 static const char kDigit[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
58 
59 
60 static inline
61 SIZE_TYPE s_DiffPtr(const char* end, const char* start)
62 {
63  return end ? (SIZE_TYPE)(end - start) : (SIZE_TYPE) 0;
64 }
65 
// Shared empty narrow C string.
66 const char *const kEmptyCStr = "";
67 
// Wide-character counterpart; compiled only when HAVE_WSTRING is defined.
68 #if defined(HAVE_WSTRING)
69 const wchar_t *const kEmptyWCStr = L"";
70 #endif
71 
72 
// Version marker: the literal prefix concatenated with the stringized value of
// NCBI_DEVELOPMENT_VER. The preceding 'extern' declaration gives this const
// object external linkage (a namespace-scope const is internal by default).
// NOTE(review): presumably kept so the marker can be located in built
// binaries -- confirm against the build tooling.
73 extern const char* const kNcbiDevelopmentVersionString;
74 const char* const kNcbiDevelopmentVersionString
75  = "NCBI_DEVELOPMENT_VER_" NCBI_AS_STRING(NCBI_DEVELOPMENT_VER);
76 
// Same pattern for production builds; present only when NCBI_PRODUCTION_VER
// is defined.
77 #ifdef NCBI_PRODUCTION_VER
78 extern const char* const kNcbiProductionVersionString;
79 const char* const kNcbiProductionVersionString
80  = "NCBI_PRODUCTION_VER_" NCBI_AS_STRING(NCBI_PRODUCTION_VER);
81 #endif
82 
83 
84 #if !defined(NCBI_OS_MSWIN) && !( defined(NCBI_OS_LINUX) && defined(NCBI_COMPILER_GCC) )
85 const string* CNcbiEmptyString::m_Str = 0;
86 const string& CNcbiEmptyString::FirstGet(void) {
87  static const string s_Str = "";
88  m_Str = &s_Str;
89  return s_Str;
90 }
91 # ifdef HAVE_WSTRING
92 const wstring* CNcbiEmptyWString::m_Str = 0;
93 const wstring& CNcbiEmptyWString::FirstGet(void) {
94  static const wstring s_Str = L"";
95  m_Str = &s_Str;
96  return s_Str;
97 }
98 # endif
99 #endif
100 
101 
102 bool NStr::IsBlank(const CTempString str, SIZE_TYPE pos)
103 {
104  SIZE_TYPE len = str.length();
105  for (SIZE_TYPE idx = pos; idx < len; ++idx) {
106  if (!isspace((unsigned char) str[idx])) {
107  return false;
108  }
109  }
110  return true;
111 }
112 
113 
115  const char* pattern)
116 {
117  if (pos == NPOS || !n || str.length() <= pos) {
118  return *pattern ? -1 : 0;
119  }
120  if ( !*pattern ) {
121  return 1;
122  }
123  if (n == NPOS || n > str.length() - pos) {
124  n = str.length() - pos;
125  }
126  const char* s = str.data() + pos;
127  while (n && *pattern && *s == *pattern) {
128  s++; pattern++; n--;
129  }
130  if (n == 0) {
131  return *pattern ? -1 : 0;
132  }
133  return *s - *pattern;
134 }
135 
136 
137 
139  const CTempString pattern)
140 {
141  if (pos == NPOS || !n || str.length() <= pos) {
142  return pattern.empty() ? 0 : -1;
143  }
144  if (pattern.empty()) {
145  return 1;
146  }
147  if (n == NPOS || n > str.length() - pos) {
148  n = str.length() - pos;
149  }
150  SIZE_TYPE n_cmp = n;
151  if (n_cmp > pattern.length()) {
152  n_cmp = pattern.length();
153  }
154  const char* s = str.data() + pos;
155  const char* p = pattern.data();
156  while (n_cmp && *s == *p) {
157  s++; p++; n_cmp--;
158  }
159 
160  if (n_cmp == 0) {
161  if (n == pattern.length())
162  return 0;
163  return n > pattern.length() ? 1 : -1;
164  }
165 
166  return *s - *p;
167 }
168 
169 
171  const char* pattern)
172 {
173  if (pos == NPOS || !n || str.length() <= pos) {
174  return *pattern ? -1 : 0;
175  }
176  if ( !*pattern ) {
177  return 1;
178  }
179 
180  if (n == NPOS || n > str.length() - pos) {
181  n = str.length() - pos;
182  }
183 
184  const char* s = str.data() + pos;
185  while (n && *pattern &&
186  tolower((unsigned char)(*s)) ==
187  tolower((unsigned char)(*pattern))) {
188  s++; pattern++; n--;
189  }
190 
191  if (n == 0) {
192  return *pattern ? -1 : 0;
193  }
194 
195  return tolower((unsigned char)(*s)) - tolower((unsigned char)(*pattern));
196 }
197 
198 
200  const CTempString pattern)
201 {
202  if (pos == NPOS || !n || str.length() <= pos) {
203  return pattern.empty() ? 0 : -1;
204  }
205  if (pattern.empty()) {
206  return 1;
207  }
208 
209  if (n == NPOS || n > str.length() - pos) {
210  n = str.length() - pos;
211  }
212 
213  SIZE_TYPE n_cmp = n;
214  if (n_cmp > pattern.length()) {
215  n_cmp = pattern.length();
216  }
217  const char* s = str.data() + pos;
218  const char* p = pattern.data();
219  while (n_cmp &&
220  tolower((unsigned char)(*s)) == tolower((unsigned char)(*p))) {
221  s++; p++; n_cmp--;
222  }
223 
224  if (n_cmp == 0) {
225  if (n == pattern.length())
226  return 0;
227  return n > pattern.length() ? 1 : -1;
228  }
229 
230  return tolower((unsigned char)(*s)) - tolower((unsigned char)(*p));
231 }
232 
233 
234 // NOTE: This code is used also in the CDirEntry::MatchesMask.
236 {
237  char c;
238  for ( size_t str_pos = 0, mask_pos = 0; ; ) {
239  // Analyze symbol in mask
240  switch ( c = mask[mask_pos++] ) {
241  case '\0':
242  return str[str_pos] == '\0';
243 
244  case '?':
245  if (str[str_pos] == '\0') {
246  return false;
247  }
248  ++str_pos;
249  break;
250 
251  case '*':
252  c = mask[mask_pos];
253  // Collapse multiple stars
254  while ( c == '*' ) {
255  c = mask[++mask_pos];
256  }
257  if (c == '\0') {
258  return true;
259  }
260  // General case, use recursion
261  while ( str[str_pos] ) {
262  if ( MatchesMask(str.substr(str_pos),
263  mask.substr(mask_pos),
264  use_case) ) {
265  return true;
266  }
267  ++str_pos;
268  }
269  return false;
270 
271  default:
272  // Compare non pattern character in mask and name
273  char s = str[str_pos++];
274  if (use_case == NStr::eNocase) {
275  c = (char)tolower((unsigned char) c);
276  s = (char)tolower((unsigned char) s);
277  }
278  if (c != s) {
279  return false;
280  }
281  break;
282  }
283  }
284  return false;
285 }
286 
287 
288 char* NStr::ToLower(char* str)
289 {
290  char* s;
291  for (s = str; *str; str++) {
292  *str = (char)tolower((unsigned char)(*str));
293  }
294  return s;
295 }
296 
297 
298 string& NStr::ToLower(string& str)
299 {
300  NON_CONST_ITERATE (string, it, str) {
301  *it = (char)tolower((unsigned char)(*it));
302  }
303  return str;
304 }
305 
306 
307 char* NStr::ToUpper(char* str)
308 {
309  char* s;
310  for (s = str; *str; str++) {
311  *str = (char)toupper((unsigned char)(*str));
312  }
313  return s;
314 }
315 
316 
317 string& NStr::ToUpper(string& str)
318 {
319  NON_CONST_ITERATE (string, it, str) {
320  *it = (char)toupper((unsigned char)(*it));
321  }
322  return str;
323 }
324 
325 
326 bool NStr::IsLower(const CTempString str)
327 {
328  SIZE_TYPE len = str.length();
329  for (SIZE_TYPE i = 0; i < len; ++i) {
330  if (isalpha((unsigned char)str[i]) && !islower((unsigned char)str[i])) {
331  return false;
332  }
333  }
334  return true;
335 }
336 
337 
338 bool NStr::IsUpper(const CTempString str)
339 {
340  SIZE_TYPE len = str.length();
341  for (SIZE_TYPE i = 0; i < len; ++i) {
342  if (isalpha((unsigned char)str[i]) && !isupper((unsigned char)str[i])) {
343  return false;
344  }
345  }
346  return true;
347 }
348 
349 
351 {
352  int error = 0, ret = -1;
353  size_t len = str.size();
354 
355  if (!len) {
356  error = EINVAL;
357  } else {
358  size_t i = 0;
359  // skip leading '+' if any
360  if (str.data()[0] == '+' && len > 1) {
361  ++i;
362  }
363  unsigned v = 0;
364  for (; i < len; ++i) {
365  unsigned d = str.data()[i] - '0';
366  if (d > 9) {
367  error = EINVAL;
368  break;
369  }
370  unsigned nv = v * 10 + d;
371  const unsigned kOverflowLimit = (INT_MAX - 9) / 10 + 1;
372  if (v >= kOverflowLimit) {
373  // possible overflow
374  if (v > kOverflowLimit || nv > INT_MAX) {
375  error = ERANGE;
376  break;
377  }
378  }
379  v = nv;
380  }
381  if (!error) {
382  ret = static_cast<int>(v);
383  }
384  }
385 /*
386  if (flags & fConvErr_NoErrno) {
387  return ret;
388  }
389 */
390  errno = error;
391  if (error) {
392  if (flags & fConvErr_NoErrMessage) {
393  CNcbiError::SetErrno(error);
394  } else {
395  CNcbiError::SetErrno(error, str);
396  }
397  }
398  return ret;
399 }
400 
401 
402 /// @internal
403 // Access to errno is slow on some platforms, because it uses TLS to store a value
404 // for each thread. This guard class lets the string-to-numeric conversion functions
405 // set errno only once, right before exit, and only when necessary.
407 {
408 public:
410  m_NoErrno(false), // m_NoErrno((flags & NStr::fConvErr_NoErrno) > 0),
411  m_NoThrow((flags & NStr::fConvErr_NoThrow) > 0),
412  m_SkipIfZero(skip_if_zero),
413  m_Errno(0)
414  {}
415  ~CS2N_Guard(void) {
416  if (!m_NoErrno) {
417  // Does the guard is used against the code that already set an errno?
418  // If the error code is not defined here, do not even try to check/set it.
419  if (!m_SkipIfZero || m_Errno) {
420  errno = m_Errno;
421  }
422  }
423  }
424  void Set(int errcode) { m_Errno = errcode; }
425  int Errno(void) const { return m_Errno;}
426  // Says that we want to throw an exception, do not set errno in this case
427  void Throw(void) { m_SkipIfZero = true; m_Errno = 0; }
428  bool ToThrow(void) const { return !m_NoThrow; }
429  // Auxiliary function to create a message about conversion error
430  // to specified type. It doesn't have any relation to the guard itself,
431  // but can help to save on the amount of code in calling macro.
432  string Message(const CTempString str, const char* to_type, const CTempString msg);
433 
434 private:
435  bool m_NoErrno; // do not set errno at all
436  bool m_NoThrow; // do not throw an exception if TRUE
437  bool m_SkipIfZero; // do not set errno if TRUE and m_Errno == 0
438  int m_Errno; // errno value to set
439 };
440 
441 string CS2N_Guard::Message(const CTempString str, const char* to_type, const CTempString msg)
442 {
443  string s;
444  s.reserve(str.length() + msg.length() + 50);
445  s += "Cannot convert string '";
446  s += str;
447  s += "' to ";
448  s += to_type;
449  if ( !msg.empty() ) {
450  s += ", ";
451  s += msg;
452  }
453  return s;
454 }
455 
/// Regular guard: declares the local 'err_guard' object used by the macros
/// below. On scope exit it stores its error code (zero if none was set) into
/// errno, except after S2N_CONVERT_ERROR has chosen to throw.
#define S2N_CONVERT_GUARD(flags) \
    CS2N_Guard err_guard(flags, false)

/// Guard for functions that call code which already sets errno itself.
/// If no error code was recorded in the guard, errno is left untouched
/// (it is not even reset to zero).
#define S2N_CONVERT_GUARD_EX(flags) \
    CS2N_Guard err_guard(flags, true)

/// Report a conversion failure and leave the enclosing function.
///
/// Relies on names from the enclosing scope: 'err_guard' (declared with one of
/// the guard macros above), 'str' (the text being converted) and 'flags'.
///
/// @param to_type  Target type name; stringized into the message.
/// @param msg      Extra detail appended to the message (may be empty).
/// @param errcode  errno-style code recorded in the guard.
/// @param pos      Position in 'str' reported with the exception.
///
/// If the guard allows throwing, a CStringException(eConvert) is thrown and
/// errno is left alone. Otherwise the error is recorded through CNcbiError
/// (with or without message text, depending on NStr::fConvErr_NoErrMessage)
/// and the enclosing function returns 0.
/// A former early return for NStr::fConvErr_NoErrno was already disabled
/// (commented out) and is not reproduced here.
#define S2N_CONVERT_ERROR(to_type, msg, errcode, pos)                \
    do {                                                             \
        err_guard.Set(errcode);                                      \
        if ( err_guard.ToThrow() ) {                                 \
            err_guard.Throw();                                       \
            NCBI_THROW2(CStringException, eConvert,                  \
                        err_guard.Message(str, #to_type, msg), pos); \
        } else {                                                     \
            if (flags & NStr::fConvErr_NoErrMessage) {               \
                CNcbiError::SetErrno(err_guard.Errno());             \
            } else {                                                 \
                CNcbiError::SetErrno(err_guard.Errno(),              \
                    err_guard.Message(str, #to_type, msg));          \
            }                                                        \
            return 0;                                                \
        }                                                            \
    } while (false)


/// Invalid input (EINVAL) at the caller's current 'pos', no extra message.
#define S2N_CONVERT_ERROR_INVAL(to_type)                             \
    S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos)

/// Bad radix (EINVAL) at the caller's current 'pos', with explanation 'msg'.
#define S2N_CONVERT_ERROR_RADIX(to_type, msg)                        \
    S2N_CONVERT_ERROR(to_type, msg, EINVAL, pos)

/// Value out of range (ERANGE) at the caller's current 'pos'.
#define S2N_CONVERT_ERROR_OVERFLOW(to_type)                          \
    S2N_CONVERT_ERROR(to_type, "overflow",ERANGE, pos)

/// Fail with EINVAL if 'str[pos]' is not the terminating zero, i.e. some input
/// is left unparsed. Uses the caller's 'str' and 'pos'.
/// Written in if/else form so that an 'else' following the macro at a call
/// site cannot silently attach to the macro's own 'if'.
#define CHECK_ENDPTR(to_type)                                        \
    if ( !str[pos] ) {                                               \
        /* whole input consumed -- nothing to report */              \
    } else {                                                         \
        S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos);          \
    }

/// Same check for length-delimited input: fail with EINVAL if 'pos' has not
/// reached 'size'. Uses the caller's 'pos' and 'size'.
#define CHECK_ENDPTR_SIZE(to_type)                                   \
    if ( !(pos < size) ) {                                           \
        /* whole input consumed -- nothing to report */              \
    } else {                                                         \
        S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos);          \
    }

/// Thousands-separator handling for the digit loops.
///
/// Must be expanded directly inside the loop body: it uses 'break' to stop
/// parsing and 'continue' to go to the next character, so it cannot be wrapped
/// in do/while. Uses the caller's 'flags', 'ch', 'pos', 'numpos' and 'comma'.
/// 'comma' is -1 until the first comma is seen; afterwards it counts the
/// symbols read since the last comma. A comma is accepted only if it is not
/// the first symbol of the number and, when it is not the first comma, exactly
/// three symbols follow the previous one.
#define CHECK_COMMAS                                                 \
    /* Check on possible commas */                                   \
    if (flags & NStr::fAllowCommas) {                                \
        if (ch == ',') {                                             \
            if ((numpos == pos)  ||                                  \
                ((comma >= 0)  &&  (comma != 3)) ) {                 \
                /* Not first comma, sitting on incorrect place */    \
                break;                                               \
            }                                                        \
            /* Skip it */                                            \
            comma = 0;                                               \
            pos++;                                                   \
            continue;                                                \
        } else {                                                     \
            if (comma >= 0) {                                        \
                /* Count symbols between commas */                   \
                comma++;                                             \
            }                                                        \
        }                                                            \
    }
529 
530 
532 {
533  S2N_CONVERT_GUARD_EX(flags);
534  Int8 value = StringToInt8(str, flags, base);
535  if ( value < kMin_Int || value > kMax_Int) {
536  S2N_CONVERT_ERROR(int, "overflow", ERANGE, 0);
537  }
538  return (int) value;
539 }
540 
541 
542 unsigned int
544 {
545  S2N_CONVERT_GUARD_EX(flags);
546  Uint8 value = StringToUInt8(str, flags, base);
547  if ( value > kMax_UInt ) {
548  S2N_CONVERT_ERROR(unsigned int, "overflow", ERANGE, 0);
549  }
550  return (unsigned int) value;
551 }
552 
553 
555 {
556  S2N_CONVERT_GUARD_EX(flags);
557  Int8 value = StringToInt8(str, flags, base);
558  if ( value < kMin_Long || value > kMax_Long) {
559  S2N_CONVERT_ERROR(long, "overflow", ERANGE, 0);
560  }
561  return (long) value;
562 }
563 
564 
565 unsigned long
567 {
568  S2N_CONVERT_GUARD_EX(flags);
569  Uint8 value = StringToUInt8(str, flags, base);
570  if ( value > kMax_ULong ) {
571  S2N_CONVERT_ERROR(unsigned long, "overflow", ERANGE, 0);
572  }
573  return (unsigned long) value;
574 }
575 
576 
/// @internal
/// Check whether 'ch' is a valid digit for a number written in radix 'base'.
///
/// @param ch     Character to classify.
/// @param base   Radix; callers are expected to have validated it already.
/// @param value  Optional output. Receives the numeric value computed for
///               'ch'; for radixes above 10 it is also written when that
///               value turns out to be too large for 'base'.
/// @return       true if 'ch' is a digit of the given radix.
static inline
bool s_IsGoodCharForRadix(char ch, int base, int* value = 0)
{
    if ( base <= 10 ) {
        // Shortcut for the most frequent case: only decimal digit characters
        // can qualify. The unsigned comparison also rejects anything below '0'.
        int delta = ch - '0';
        if ( unsigned(delta) < unsigned(base) ) {
            if ( value ) {
                *value = delta;
            }
            return true;
        }
        return false;
    }
    if ( !isalnum((unsigned char) ch) ) {
        return false;
    }
    // Corresponding numeric value of 'ch'
    int delta;
    if ( isdigit((unsigned char) ch) ) {
        delta = ch - '0';
    } else {
        ch = (char)tolower((unsigned char) ch);
        delta = ch - 'a' + 10;
    }
    if ( value ) {
        *value = delta;
    }
    // isalnum() is locale dependent: outside the "C" locale it may accept a
    // non-ASCII letter, which on platforms with signed 'char' produces a
    // negative delta. Such a value must not be taken for a digit.
    return delta >= 0  &&  delta < base;
}
609 
610 
611 // Skipping modes for s_SkipAllowedSymbols(), which advances the position 'pos'
612 // past the symbols that are allowed to surround a number in the string.
613 enum ESkipMode {
614  eSkipAll, // skip everything, up to the end of the string
615  eSkipAllAllowed, // skip any symbols, but stop at a digit, '+', '-' or decimal point
616  eSkipSpacesOnly // skip whitespace only
617 };
618 
619 static inline
621 {
622  if ( ch != '.' && ch != ',') {
623  return false;
624  }
625  if (flags & NStr::fDecimalPosix) {
626  return ch == '.';
627  }
628  else if (flags & NStr::fDecimalPosixOrLocal) {
629  return ch == '.' || ch == ',';
630  }
631  struct lconv* conv = localeconv();
632  return ch == *(conv->decimal_point);
633 }
634 
635 static inline
637  SIZE_TYPE& pos,
638  ESkipMode skip_mode,
640 {
641  if (skip_mode == eSkipAll) {
642  pos = str.length();
643  return;
644  }
645 
646  for ( SIZE_TYPE len = str.length(); pos < len; ++pos ) {
647  unsigned char ch = str[pos];
648  if ( isdigit(ch) || ch == '+' || ch == '-' || s_IsDecimalPoint(ch,flags) ) {
649  break;
650  }
651  if ( (skip_mode == eSkipSpacesOnly) && !isspace(ch) ) {
652  break;
653  }
654  }
655 }
656 
657 
658 // Check radix base. If it is zero, determine base using first chars
659 // of the string. Update 'base' value.
660 // Update 'ptr' to current position in the string.
661 static inline
662 bool s_CheckRadix(const CTempString str, SIZE_TYPE& pos, int& base)
663 {
664  if ( base == 10 || base == 8 ) {
665  // shortcut for most frequent case
666  return true;
667  }
668  // Check base
669  if ( base < 0 || base == 1 || base > 36 ) {
670  return false;
671  }
672  // Try to determine base using first chars of the string
673  unsigned char ch = str[pos];
674  unsigned char next = str[pos+1];
675  if ( base == 0 ) {
676  if ( ch != '0' ) {
677  base = 10;
678  } else if (next == 'x' || next == 'X') {
679  base = 16;
680  } else {
681  base = 8;
682  }
683  }
684  // Remove leading '0x' for hex numbers
685  if ( base == 16 ) {
686  if (ch == '0' && (next == 'x' || next == 'X')) {
687  pos += 2;
688  }
689  }
690  return true;
691 }
692 
693 
695 {
696  S2N_CONVERT_GUARD(flags);
697 
698  // Current position in the string
699  SIZE_TYPE pos = 0;
700 
701  // Skip allowed leading symbols
702  if (flags & fAllowLeadingSymbols) {
703  bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
704  s_SkipAllowedSymbols(str, pos,
705  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
706  }
707  // Determine sign
708  bool sign = false;
709  switch (str[pos]) {
710  case '-':
711  sign = true;
712  /*FALLTHRU*/
713  case '+':
714  pos++;
715  break;
716  default:
717  if (flags & fMandatorySign) {
719  }
720  break;
721  }
722  SIZE_TYPE pos0 = pos;
723  // Check radix base
724  if ( !s_CheckRadix(str, pos, base) ) {
725  S2N_CONVERT_ERROR_RADIX(Int8, "bad numeric base '" +
726  NStr::IntToString(base)+ "'");
727  }
728 
729  // Begin conversion
730  Int8 n = 0;
731  Int8 limdiv = base==10? kMax_I8 / 10: kMax_I8 / base;
732  Int8 limoff = (base==10? kMax_I8 % 10: kMax_I8 % base) + (sign ? 1 : 0);
733 
734  // Number of symbols between two commas. '-1' means -- no comma yet.
735  int comma = -1;
736  SIZE_TYPE numpos = pos;
737 
738  while (char ch = str[pos]) {
739  int delta; // corresponding numeric value of 'ch'
740 
741  // Check on possible commas
742  CHECK_COMMAS;
743  // Sanity check
744  if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
745  break;
746  }
747  // Overflow check
748  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
750  }
751  n *= base;
752  n += delta;
753  pos++;
754  }
755 
756  // Last checks
757  if ( pos == pos0 || ((comma >= 0) && (comma != 3)) ) {
759  }
760  // Skip allowed trailing symbols
761  if (flags & fAllowTrailingSymbols) {
762  bool spaces = ((flags & fAllowTrailingSymbols) ==
764  s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll, flags);
765  }
766  // Assign sign before the end pointer check
767  n = sign ? -n : n;
769 
770  return n;
771 }
772 
773 
775  TStringToNumFlags flags, int base)
776 {
777  S2N_CONVERT_GUARD(flags);
778 
779  const TStringToNumFlags slow_flags =
781 
782  if ( base == 10 && (flags & slow_flags) == 0 ) {
783  // fast conversion
784 
785  // Current position in the string
786  CTempString::const_iterator ptr = str.begin(), end = str.end();
787 
788  // Determine sign
789  if ( ptr != end && *ptr == '+' ) {
790  ++ptr;
791  }
792  if ( ptr == end ) {
793  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, ptr-str.begin());
794  }
795 
796  // Begin conversion
797  Uint8 n = 0;
798 
799  const Uint8 limdiv = kMax_UI8/10;
800  const int limoff = int(kMax_UI8 % 10);
801 
802  do {
803  char ch = *ptr;
804  int delta = ch - '0';
805  if ( unsigned(delta) >= 10 ) {
806  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, ptr-str.begin());
807  }
808  // Overflow check
809  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
810  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, ptr-str.begin());
811  }
812  n = n*10+delta;
813  } while ( ++ptr != end );
814 
815  return n;
816  }
817 
818  // Current position in the string
819  SIZE_TYPE pos = 0, size = str.size();
820 
821  // Skip allowed leading symbols
822  if (flags & fAllowLeadingSymbols) {
823  bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
824  s_SkipAllowedSymbols(str, pos,
825  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
826  }
827  // Determine sign
828  if (str[pos] == '+') {
829  pos++;
830  } else {
831  if (flags & fMandatorySign) {
833  }
834  }
835  SIZE_TYPE pos0 = pos;
836 
837  // Begin conversion
838  Uint8 n = 0;
839  // Check radix base
840  if ( !s_CheckRadix(str, pos, base) ) {
841  S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
842  NStr::IntToString(base) + "'");
843  }
844 
845  Uint8 limdiv = kMax_UI8 / base;
846  int limoff = int(kMax_UI8 % base);
847 
848  // Number of symbols between two commas. '-1' means -- no comma yet.
849  int comma = -1;
850  SIZE_TYPE numpos = pos;
851 
852  while (char ch = str[pos]) {
853  int delta; // corresponding numeric value of 'ch'
854 
855  // Check on possible commas
856  CHECK_COMMAS;
857  // Sanity check
858  if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
859  break;
860  }
861  // Overflow check
862  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
864  }
865  n *= base;
866  n += delta;
867  pos++;
868  }
869 
870  // Last checks
871  if ( pos == pos0 || ((comma >= 0) && (comma != 3)) ) {
873  }
874  // Skip allowed trailing symbols
875  if (flags & fAllowTrailingSymbols) {
876  bool spaces = ((flags & fAllowTrailingSymbols) ==
878  s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll, flags);
879  }
881  return n;
882 }
883 
884 
885 double NStr::StringToDoublePosix(const char* ptr, char** endptr, TStringToNumFlags flags)
886 {
888 
889  const char* start = ptr;
890  char c = *ptr++;
891 
892  // skip leading blanks
893  while ( isspace((unsigned char)c) ) {
894  c = *ptr++;
895  }
896 
897  int sign = 0;
898  if ( c == '-' ) {
899  sign = -1;
900  c = *ptr++;
901  }
902  else if ( c == '+' ) {
903  sign = +1;
904  c = *ptr++;
905  }
906 
907  if (c == 0) {
908  if (endptr) {
909  *endptr = (char*)start;
910  }
911  err_guard.Set(EINVAL);
912  return 0.;
913  }
914 
915  // short-cut - single digit
916  if ( !*ptr && c >= '0' && c <= '9' ) {
917  if (endptr) {
918  *endptr = (char*)ptr;
919  }
920  double result = c-'0';
921  // some compilers fail to negate zero
922  return sign < 0 ? (c == '0' ? -0. : -result) : result;
923  }
924 
925  bool dot = false, expn = false, anydigits = false;
926  int digits = 0, dot_position = 0;
927  unsigned int first=0, second=0, first_mul=1;
928  long double second_mul = NCBI_CONST_LONGDOUBLE(1.),
929  third = NCBI_CONST_LONGDOUBLE(0.);
930 
931  // up to exponent
932  for ( ; ; c = *ptr++ ) {
933  if (c >= '0' && c <= '9') {
934  // digits: accumulate
935  c = (char)(c - '0');
936  anydigits = true;
937  ++digits;
938  if (first == 0) {
939  first = c;
940  if ( first == 0 ) {
941  // omit leading zeros
942  --digits;
943  if (dot) {
944  --dot_position;
945  }
946  }
947  } else if (digits <= 9) {
948  // first 9 digits come to 'first'
949  first = first*10 + c;
950  } else if (digits <= 18) {
951  // next 9 digits come to 'second'
952  first_mul *= 10;
953  second = second*10 + c;
954  } else {
955  // other digits come to 'third'
956  second_mul *= NCBI_CONST_LONGDOUBLE(10.);
957  third = third * NCBI_CONST_LONGDOUBLE(10.) + c;
958  }
959  }
960  else if (c == '.') {
961  // dot
962  // if second dot, stop
963  if (dot) {
964  --ptr;
965  break;
966  }
967  dot_position = digits;
968  dot = true;
969  }
970  else if (c == 'e' || c == 'E') {
971  // if exponent, stop
972  if (!anydigits) {
973  --ptr;
974  break;
975  }
976  expn = true;
977  break;
978  }
979  else {
980  --ptr;
981  if (!anydigits) {
982  if ( !dot && (c == 'n' || c == 'N') &&
983  NStr::strncasecmp(ptr,"nan",3)==0) {
984  if (endptr) {
985  *endptr = (char*)(ptr+3);
986  }
987  return HUGE_VAL/HUGE_VAL; /* NCBI_FAKE_WARNING */
988  }
989  if ( (c == 'i' || c == 'I') ) {
990  if ( NStr::strncasecmp(ptr,"inf",3)==0) {
991  ptr += 3;
992  if ( NStr::strncasecmp(ptr,"inity",5)==0) {
993  ptr += 5;
994  }
995  if (endptr) {
996  *endptr = (char*)ptr;
997  }
998  return sign < 0 ? -HUGE_VAL : HUGE_VAL;
999  }
1000  }
1001  }
1002  break;
1003  }
1004  }
1005  // if no digits, stop now - error
1006  if (!anydigits) {
1007  if (endptr) {
1008  *endptr = (char*)start;
1009  }
1010  err_guard.Set(EINVAL);
1011  return 0.;
1012  }
1013  int exponent = dot ? dot_position - digits : 0;
1014 
1015  // read exponent
1016  if (expn && *ptr) {
1017  int expvalue = 0;
1018  bool expsign = false, expnegate= false;
1019  int expdigits= 0;
1020  for( ; ; ++ptr) {
1021  c = *ptr;
1022  // sign: should be no digits at this point
1023  if (c == '-' || c == '+') {
1024  // if there was sign or digits, stop
1025  if (expsign || expdigits) {
1026  break;
1027  }
1028  expsign = true;
1029  expnegate = c == '-';
1030  }
1031  // digits: accumulate
1032  else if (c >= '0' && c <= '9') {
1033  ++expdigits;
1034  int newexpvalue = expvalue*10 + (c-'0');
1035  if (newexpvalue > expvalue) {
1036  expvalue = newexpvalue;
1037  }
1038  }
1039  else {
1040  break;
1041  }
1042  }
1043  // if no digits, rollback
1044  if (!expdigits) {
1045  // rollback sign
1046  if (expsign) {
1047  --ptr;
1048  }
1049  // rollback exponent
1050  if (expn) {
1051  --ptr;
1052  }
1053  }
1054  else {
1055  exponent = expnegate ? exponent - expvalue : exponent + expvalue;
1056  }
1057  }
1058  long double ret;
1059  if ( first_mul > 1 ) {
1060  _ASSERT(first);
1061  ret = ((long double)first * first_mul + second)* second_mul + third;
1062  }
1063  else {
1064  _ASSERT(first_mul == 1);
1065  _ASSERT(second == 0);
1066  _ASSERT(second_mul == 1);
1067  _ASSERT(third == 0);
1068  ret = first;
1069  }
1070  // calculate exponent
1071  if ( first && exponent ) {
1072  // multiply by power of 10 only non-zero mantissa
1073  if (exponent > 2*DBL_MAX_10_EXP) {
1074  ret = (flags & fDecimalPosixFinite) ? DBL_MAX : HUGE_VAL;
1075  err_guard.Set(ERANGE);
1076  } else if (exponent < 2*DBL_MIN_10_EXP) {
1077  ret = (flags & fDecimalPosixFinite) ? DBL_MIN : 0.;
1078  err_guard.Set(ERANGE);
1079  } else {
1080  if ( exponent > 0 ) {
1081  static const double mul1[16] = {
1082  1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
1083  1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15
1084  };
1085  ret *= mul1[exponent&15];
1086  if ( exponent >>= 4 ) {
1087  static const long double mul2[16] = {
1088  NCBI_CONST_LONGDOUBLE(1e0),
1089  NCBI_CONST_LONGDOUBLE(1e16),
1090  NCBI_CONST_LONGDOUBLE(1e32),
1091  NCBI_CONST_LONGDOUBLE(1e48),
1092  NCBI_CONST_LONGDOUBLE(1e64),
1093  NCBI_CONST_LONGDOUBLE(1e80),
1094  NCBI_CONST_LONGDOUBLE(1e96),
1095  NCBI_CONST_LONGDOUBLE(1e112),
1096  NCBI_CONST_LONGDOUBLE(1e128),
1097  NCBI_CONST_LONGDOUBLE(1e144),
1098  NCBI_CONST_LONGDOUBLE(1e160),
1099  NCBI_CONST_LONGDOUBLE(1e176),
1100  NCBI_CONST_LONGDOUBLE(1e192),
1101  NCBI_CONST_LONGDOUBLE(1e208),
1102  NCBI_CONST_LONGDOUBLE(1e224),
1103  NCBI_CONST_LONGDOUBLE(1e240)
1104  };
1105  ret *= mul2[exponent&15];
1106  for ( exponent >>= 4; exponent; --exponent ) {
1107  ret *= NCBI_CONST_LONGDOUBLE(1e256);
1108  }
1109  }
1110  if (!finite(double(ret))) {
1111  if (flags & fDecimalPosixFinite) {
1112  ret = DBL_MAX;
1113  }
1114  err_guard.Set(ERANGE);
1115  }
1116  }
1117  else {
1118  exponent = -exponent;
1119  static const long double mul1[16] = {
1120  NCBI_CONST_LONGDOUBLE(1e-0),
1121  NCBI_CONST_LONGDOUBLE(1e-1),
1122  NCBI_CONST_LONGDOUBLE(1e-2),
1123  NCBI_CONST_LONGDOUBLE(1e-3),
1124  NCBI_CONST_LONGDOUBLE(1e-4),
1125  NCBI_CONST_LONGDOUBLE(1e-5),
1126  NCBI_CONST_LONGDOUBLE(1e-6),
1127  NCBI_CONST_LONGDOUBLE(1e-7),
1128  NCBI_CONST_LONGDOUBLE(1e-8),
1129  NCBI_CONST_LONGDOUBLE(1e-9),
1130  NCBI_CONST_LONGDOUBLE(1e-10),
1131  NCBI_CONST_LONGDOUBLE(1e-11),
1132  NCBI_CONST_LONGDOUBLE(1e-12),
1133  NCBI_CONST_LONGDOUBLE(1e-13),
1134  NCBI_CONST_LONGDOUBLE(1e-14),
1135  NCBI_CONST_LONGDOUBLE(1e-15)
1136  };
1137  ret *= mul1[exponent&15];
1138  if ( exponent >>= 4 ) {
1139  static const long double mul2[16] = {
1140  NCBI_CONST_LONGDOUBLE(1e-0),
1141  NCBI_CONST_LONGDOUBLE(1e-16),
1142  NCBI_CONST_LONGDOUBLE(1e-32),
1143  NCBI_CONST_LONGDOUBLE(1e-48),
1144  NCBI_CONST_LONGDOUBLE(1e-64),
1145  NCBI_CONST_LONGDOUBLE(1e-80),
1146  NCBI_CONST_LONGDOUBLE(1e-96),
1147  NCBI_CONST_LONGDOUBLE(1e-112),
1148  NCBI_CONST_LONGDOUBLE(1e-128),
1149  NCBI_CONST_LONGDOUBLE(1e-144),
1150  NCBI_CONST_LONGDOUBLE(1e-160),
1151  NCBI_CONST_LONGDOUBLE(1e-176),
1152  NCBI_CONST_LONGDOUBLE(1e-192),
1153  NCBI_CONST_LONGDOUBLE(1e-208),
1154  NCBI_CONST_LONGDOUBLE(1e-224),
1155  NCBI_CONST_LONGDOUBLE(1e-240)
1156  };
1157  ret *= mul2[exponent&15];
1158  for ( exponent >>= 4; exponent; --exponent ) {
1159  ret *= NCBI_CONST_LONGDOUBLE(1e-256);
1160  }
1161  }
1162  if ( ret < DBL_MIN ) {
1163  if (flags & fDecimalPosixFinite) {
1164  ret = DBL_MIN;
1165  }
1166  err_guard.Set(ERANGE);
1167  }
1168  }
1169  }
1170  }
1171  if ( sign < 0 ) {
1172  ret = -ret;
1173  }
1174  // done
1175  if (endptr) {
1176  *endptr = (char*)ptr;
1177  }
1178  return (double)ret;
1179 }
1180 
1181 
1182 /// @internal
1183 static double s_StringToDouble(const char* str, size_t size,
1185 {
1186  _ASSERT(str[size] == '\0');
1187  if ((flags & NStr::fDecimalPosix) && (flags & NStr::fDecimalPosixOrLocal)) {
1188  NCBI_THROW2(CStringException, eBadArgs,
1189  "NStr::StringToDouble(): mutually exclusive flags specified",0);
1190  }
1191  S2N_CONVERT_GUARD_EX(flags);
1192 
1193  // Current position in the string
1194  SIZE_TYPE pos = 0;
1195 
1196  // Skip allowed leading symbols
1197  if (flags & NStr::fAllowLeadingSymbols) {
1198  bool spaces = ((flags & NStr::fAllowLeadingSymbols) ==
1200  s_SkipAllowedSymbols(CTempString(str, size), pos,
1201  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
1202  }
1203  // Check mandatory sign
1204  if (flags & NStr::fMandatorySign) {
1205  switch (str[pos]) {
1206  case '-':
1207  case '+':
1208  break;
1209  default:
1210  S2N_CONVERT_ERROR_INVAL(double);
1211  }
1212  }
1213  // For consistency make additional check on incorrect leading symbols.
1214  // Because strtod() may just skip such symbols.
1215  if (!(flags & NStr::fAllowLeadingSymbols)) {
1216  char c = str[pos];
1217  if ( !isdigit((unsigned char)c) && !s_IsDecimalPoint(c,flags) && c != '-' && c != '+') {
1218  S2N_CONVERT_ERROR_INVAL(double);
1219  }
1220  }
1221 
1222  // Conversion
1223  int& errno_ref = errno;
1224  errno_ref = 0;
1225 
1226  char* endptr = 0;
1227  const char* begptr = str + pos;
1228 
1229  double n;
1230  if (flags & NStr::fDecimalPosix) {
1231  n = NStr::StringToDoublePosix(begptr, &endptr, flags);
1232  } else {
1233  n = strtod(begptr, &endptr);
1234  }
1235  if (flags & NStr::fDecimalPosixOrLocal) {
1236  char* endptr2 = 0;
1237  double n2 = NStr::StringToDoublePosix(begptr, &endptr2, flags);
1238  if (!endptr || (endptr2 && endptr2 > endptr)) {
1239  n = n2;
1240  endptr = endptr2;
1241  }
1242  }
1243  if ( !endptr || endptr == begptr ) {
1244  S2N_CONVERT_ERROR(double, kEmptyStr, EINVAL, s_DiffPtr(endptr, begptr) + pos);
1245  }
1246  // some libs set ERANGE, others do not
1247  // here, we do not consider ERANGE as error
1248  if ( errno_ref && errno_ref != ERANGE ) {
1249  S2N_CONVERT_ERROR(double, kEmptyStr, errno_ref, s_DiffPtr(endptr, begptr) + pos);
1250  }
1251  // special cases
1252  if ((flags & NStr::fDecimalPosixFinite) && n != 0. && !isnan(n))
1253  {
1254  bool is_negative = n < 0.;
1255  if (is_negative) {
1256  n = -n;
1257  }
1258  if ( n < DBL_MIN) {
1259  n = DBL_MIN;
1260  } else if (!finite(n)) {
1261  n = DBL_MAX;
1262  }
1263  if (is_negative) {
1264  n = -n;
1265  }
1266  }
1267 
1268  pos += s_DiffPtr(endptr, begptr);
1269 
1270  // Skip allowed trailing symbols
1271  if (flags & NStr::fAllowTrailingSymbols) {
1272  bool spaces = ((flags & NStr::fAllowTrailingSymbols) ==
1274  s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll, flags);
1275  }
1276  CHECK_ENDPTR(double);
1277  return n;
1278 }
1279 
1280 
// Converts the character buffer `str` to a double by delegating to
// s_StringToDouble() with the arguments passed through unchanged.
// NOTE(review): listing line 1282 (between the first signature line and the
// opening brace) is missing here; it presumably declares the `flags`
// parameter forwarded below -- confirm against the header.
1281 double NStr::StringToDoubleEx(const char* str, size_t size,
1283 {
1284  return s_StringToDouble(str, size, flags);
1285 }
1286 
1287 
// Converts a string object to double through s_StringToDouble(), making sure
// that the characters handed over are followed by a '\0'.
// NOTE(review): the signature line (listing line 1288) is missing from this
// listing.  From the body, `str` provides size(), data() and HasZeroAtEnd()
// and `flags` is forwarded unchanged -- confirm name and types in the header.
1289 {
1290  size_t size = str.size();
1291  if ( str.HasZeroAtEnd() ) {
1292  // string has zero at the end already
1293  return s_StringToDouble(str.data(), size, flags);
1294  }
// No terminator available: copy the characters and append one.  Inputs
// shorter than the 256-byte stack buffer avoid a heap allocation.
1295  char buf[256]; // small temporary buffer on stack for appending zero char
1296  if ( size < sizeof(buf) ) {
1297  memcpy(buf, str.data(), size);
1298  buf[size] = '\0';
1299  return s_StringToDouble(buf, size, flags);
1300  }
1301  else {
1302  // use std::string() to allocate memory for appending zero char
1303  return s_StringToDouble(string(str).c_str(), size, flags);
1304  }
1305 }
1306 
1307 /// @internal
/// Applies an optional size qualifier found at str[pos] to `value`.
///
/// Recognized qualifiers (case-insensitive through toupper()) are 'K', 'M'
/// and 'G', multiplying by 1024, 1024*1024 and 1024*1024*1024 respectively;
/// a following 'B'/'b' is consumed as well.  `pos` is advanced past every
/// consumed character.  If str[pos] is '\0' the value is returned unchanged.
///
/// Overflow is detected before multiplying, by comparing the value with
/// kMax_UI8 divided by the factor, and reported through
/// S2N_CONVERT_ERROR_OVERFLOW(DataSize).
///
/// NOTE(review): three lines are missing from this listing -- the first
/// signature line (1308), the line that presumably declares `flags` (1311)
/// and the statement of the `default:` case (1348).  The function name is
/// taken from the call site later in this file; confirm all of this against
/// the repository.
1309  SIZE_TYPE& pos,
1310  Uint8 value,
1312 {
1313  S2N_CONVERT_GUARD(flags);
1314 
1315  unsigned char ch = str[pos];
1316  if ( !ch ) {
1317  return value;
1318  }
1319 
1320  ch = (unsigned char)toupper(ch);
1321  Uint8 v = value;
1322  bool err = false;
1323 
// Each case records a pending overflow in `err` instead of failing right
// away; the error is raised once, after the switch.
1324  switch(ch) {
1325  case 'K':
1326  pos++;
1327  if ((kMax_UI8 / 1024) < v) {
1328  err = true;
1329  }
1330  v *= 1024;
1331  break;
1332  case 'M':
1333  pos++;
1334  if ((kMax_UI8 / 1024 / 1024) < v) {
1335  err = true;
1336  }
1337  v *= 1024 * 1024;
1338  break;
1339  case 'G':
1340  pos++;
1341  if ((kMax_UI8 / 1024 / 1024 / 1024) < v) {
1342  err = true;
1343  }
1344  v *= 1024 * 1024 * 1024;
1345  break;
1346  default:
1347  // error -- the "qual" points to the last unprocessed symbol
// NOTE(review): the statement that belongs here is absent from the listing.
1349  }
1350  if ( err ) {
1351  S2N_CONVERT_ERROR_OVERFLOW(DataSize);
1352  }
1353 
// Optional trailing 'B' ("KB", "MB", "GB").
1354  ch = str[pos];
1355  if ( ch && toupper(ch) == 'B' ) {
1356  pos++;
1357  }
1358  return v;
1359 }
1360 
1361 
// Parses an unsigned number in radix `base`, optionally followed by a size
// qualifier that is handled by s_DataSizeConvertQual(), and returns the
// resulting value.
//
// Steps: validate the base (2..16, otherwise CStringException/eConvert is
// thrown), skip permitted leading symbols, accept a '+' sign, check the radix
// prefix, locate the end of the digit run, convert it with StringToUInt8(),
// apply the qualifier, skip permitted trailing symbols.
//
// NOTE(review): several lines are missing from this listing (1362-1363 with
// the start of the signature, 1381, 1392, 1432 and 1435), so the function
// name, the exact flag comparisons and the final end-of-string check cannot
// be seen here -- confirm against the repository.
1364  int base)
1365 {
1366  // We have a limited base range here
1367  if ( base < 2 || base > 16 ) {
1368  NCBI_THROW2(CStringException, eConvert,
1369  "Bad numeric base '" + NStr::IntToString(base)+ "'", 0);
1370  }
1371  S2N_CONVERT_GUARD_EX(flags);
1372 
1373  // Current position in the string
1374  SIZE_TYPE pos = 0;
1375 
1376  // Find end of number representation
1377  {{
1378  // Skip allowed leading symbols
1379  if (flags & fAllowLeadingSymbols) {
1380  bool spaces = ((flags & fAllowLeadingSymbols) ==
1382  s_SkipAllowedSymbols(str, pos,
1383  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
1384  }
1385  // Determine sign
1386  if (str[pos] == '+') {
1387  pos++;
1388  // strip fMandatorySign flag
1389  flags &= ~fMandatorySign;
1390  } else {
1391  if (flags & fMandatorySign) {
// NOTE(review): the statement executed when a mandatory sign is absent
// (listing line 1392) is missing from the listing.
1393  }
1394  }
1395  // Check radix base
1396  if ( !s_CheckRadix(str, pos, base) ) {
1397  S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
1398  NStr::IntToString(base) + "'");
1399  }
1400  }}
1401 
// Scan the digit run: characters valid for the radix, plus ',' when
// fAllowCommas is set.  `ch` ends up holding the first character after it.
1402  SIZE_TYPE numpos = pos;
1403  char ch = str[pos];
1404  while (ch) {
1405  if ( !s_IsGoodCharForRadix(ch, base) &&
1406  ((ch != ',') || !(flags & fAllowCommas)) ) {
1407  break;
1408  }
1409  ch = str[++pos];
1410  }
1411  // If string is empty, just use whole remaining string for conversion
1412  // (for correct error reporting)
1413  if (pos-numpos == 0) {
1414  pos = str.length();
1415  }
1416 
1417  // Convert to number
1418  Uint8 n = StringToUInt8(CTempString(str.data()+numpos, pos-numpos),
1419  flags, base);
1420  if ( !n && errno ) {
1421  // If exceptions are enabled that it has been already thrown.
1422  // The errno is also set, so just return a zero.
1423  return 0;
1424  }
1425  // Check trailer (KB, MB, ...)
1426  if ( ch ) {
1427  n = s_DataSizeConvertQual(str, pos, n, flags);
1428  }
1429  // Skip allowed trailing symbols
1430  if (flags & fAllowTrailingSymbols) {
1431  bool spaces = ((flags & fAllowTrailingSymbols) ==
1433  s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll, flags);
1434  }
1436  return n;
1437 }
1438 
1439 
// Parses a human-readable data size ("10", "1.5 KB", "2MiB", "1,024 K", ...)
// into a number of type Uint8.
//
// Outline:
//   1. reject flags outside `allowed_flags` (CStringException, eConvert);
//   2. skip permitted leading symbols and handle the sign;
//   3. scan the number (digits, optional commas, optional single dot);
//   4. parse the suffix: one of K/M/G/T/P/E, optionally followed by "iB"
//      (binary multiplier) or "B", or a lone "B";
//   5. skip permitted trailing symbols and require the end of the string;
//   6. multiply digit-wise in decimal, so that the fraction takes part in the
//      multiplication without floating point, then convert to Uint8 with an
//      overflow check and round on the first dropped fractional digit.
//
// Errors are reported through S2N_CONVERT_ERROR with EINVAL (syntax) or
// ERANGE (overflow).
//
// NOTE(review): the first signature line (listing line 1440) and four terms
// of the `allowed_flags` sum (1446, 1447, 1449, 1450) are missing from this
// listing -- confirm against the repository.
// NOTE(review): isdigit()/isspace()/toupper() are called below with a plain
// `char`; verify the behaviour for characters with negative values.
1441  TStringToNumFlags flags /* = 0 */)
1442 {
1443  TStringToNumFlags allowed_flags = fConvErr_NoThrow +
1444  fMandatorySign +
1445  fAllowCommas +
1448  fDS_ForceBinary +
1451 
1452  if ((flags & allowed_flags) != flags) {
1453  NCBI_THROW2(CStringException, eConvert, "Wrong set of flags", 0);
1454  }
1455  S2N_CONVERT_GUARD(flags);
1456 
1457  const char* str_ptr = str.data();
1458  const char* str_end = str_ptr + str.size();
// Leading symbols: stop at the first digit, or at a sign directly followed
// by a digit when the sign is mandatory.  Whitespace is always skipped; any
// other character is skipped only when more than spaces is allowed.
1459  if (flags & fAllowLeadingSymbols) {
1460  bool allow_all = (flags & fAllowLeadingSymbols) != fAllowLeadingSpaces;
1461  for (; str_ptr < str_end; ++str_ptr) {
1462  char c = *str_ptr;
1463  if (isdigit(c))
1464  break;
1465  if (isspace(c))
1466  continue;
1467  if ((c == '+' || c == '-') && (flags & fMandatorySign)
1468  && str_ptr + 1 < str_end && isdigit(*(str_ptr + 1)))
1469  {
1470  break;
1471  }
1472  if (!allow_all)
1473  break;
1474  }
1475  }
1476 
// Sign: '+' is consumed; '-' is always an error for an unsigned result, and
// so is a missing sign when fMandatorySign is set.
1477  if (str_ptr < str_end && *str_ptr == '+') {
1478  ++str_ptr;
1479  }
1480  else if ((str_ptr < str_end && *str_ptr == '-')
1481  || (flags & fMandatorySign))
1482  {
1483  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1484  }
1485 
// Number scan.  Digits are counted separately before and after the dot.
// The scan stops at a second dot, at a dot or comma in the first position,
// at a comma after the dot, and at any other character; a separator that
// directly follows a comma makes the scan step back onto that comma.
1486  const char* num_start = str_ptr;
1487  bool have_dot = false;
1488  bool allow_commas = (flags & fAllowCommas) != 0;
1489  bool allow_dot = (flags & fDS_ProhibitFractions) == 0;
1490  Uint4 digs_pre_dot = 0, digs_post_dot = 0;
1491 
1492  for (; str_ptr < str_end; ++str_ptr) {
1493  char c = *str_ptr;
1494  if (isdigit(c)) {
1495  if (have_dot)
1496  ++digs_post_dot;
1497  else
1498  ++digs_pre_dot;
1499  }
1500  else if (c == '.' && allow_dot) {
1501  if (have_dot || str_ptr == num_start)
1502  break;
1503  if (*(str_ptr - 1) == ',') {
1504  --str_ptr;
1505  break;
1506  }
1507  have_dot = true;
1508  }
1509  else if (c == ',' && allow_commas) {
1510  if (have_dot || str_ptr == num_start)
1511  break;
1512  if (*(str_ptr - 1) == ',') {
1513  --str_ptr;
1514  break;
1515  }
1516  }
1517  else
1518  break;
1519  }
// A dot without fractional digits, or a trailing comma, is not part of the
// number: give that character back.
1520  if (have_dot && digs_post_dot == 0)
1521  --str_ptr;
1522  else if (str_ptr > num_start && *(str_ptr - 1) == ',')
1523  --str_ptr;
1524 
1525  const char* num_end = str_ptr;
1526  if (num_start == num_end) {
1527  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1528  }
// A single space between number and suffix is tolerated unless prohibited.
1529  if (str_ptr < str_end && *str_ptr == ' '
1530  && !(flags & fDS_ProhibitSpaceBeforeSuffix))
1531  {
1532  ++str_ptr;
1533  }
1534  char suff_c = 0;
1535  if (str_ptr < str_end)
1536  suff_c = (char)toupper(*str_ptr);
1537 
// s_BinCoefs[i] is 1024^(i+1) written in decimal, matching s_Suffixes[i].
1538  static const char s_Suffixes[] = {'K', 'M', 'G', 'T', 'P', 'E'};
1539  static const char* const s_BinCoefs[] = {"1024", "1048576", "1073741824",
1540  "1099511627776",
1541  "1125899906842624",
1542  "1152921504606846976"};
1543  static const Uint4 s_NumSuffixes = (Uint4)(sizeof(s_Suffixes) / sizeof(s_Suffixes[0]));
1544 
// suff_idx == s_NumSuffixes after the loop means "no multiplier suffix".
1545  bool binary_suff = (flags & fDS_ForceBinary) != 0;
1546  Uint4 suff_idx = 0;
1547  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
1548  if (suff_c == s_Suffixes[suff_idx])
1549  break;
1550  }
1551  if (suff_idx < s_NumSuffixes) {
1552  ++str_ptr;
1553  if (str_ptr + 1 < str_end && toupper(*str_ptr) == 'I'
1554  && toupper(*(str_ptr + 1)) == 'B')
1555  {
1556  str_ptr += 2;
1557  binary_suff = true;
1558  }
1559  else if (str_ptr < str_end && toupper(*str_ptr) == 'B')
1560  ++str_ptr;
1561  }
1562  else if (suff_c == 'B') {
1563  ++str_ptr;
1564  }
// No suffix after all: un-consume the space accepted above.
1565  else if (*(str_ptr - 1) == ' ')
1566  --str_ptr;
1567 
// Trailing symbols: whitespace is skipped; other characters are skipped only
// when more than spaces is allowed.
1568  if (flags & fAllowTrailingSymbols) {
1569  bool allow_all = (flags & fAllowTrailingSymbols) != fAllowTrailingSpaces;
1570  for (; str_ptr < str_end; ++str_ptr) {
1571  char c = *str_ptr;
1572  if (isspace(c))
1573  continue;
1574  if (!allow_all)
1575  break;
1576  }
1577  }
1578  if (str_ptr != str_end) {
1579  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1580  }
1581 
// Collect the decimal digits of the number (separators dropped), one digit
// value per array element, most significant first.
1582  Uint4 orig_digs = digs_pre_dot + digs_post_dot;
1583  AutoArray<Uint1> orig_num(orig_digs);
1584  str_ptr = num_start;
1585  for (Uint4 i = 0; str_ptr < num_end; ++str_ptr) {
1586  if (*str_ptr == ',' || *str_ptr == '.')
1587  continue;
1588  orig_num[i++] = Uint1(*str_ptr - '0');
1589  }
1590 
// num_to_conv/digs_to_conv describe the digits of the integer part of the
// final result; digs_post_dot keeps the count of digits that follow them.
1591  Uint1* num_to_conv = orig_num.get();
1592  Uint4 digs_to_conv = digs_pre_dot;
1593  AutoArray<Uint1> mul_num;
1594  if (binary_suff && suff_idx < s_NumSuffixes) {
// Binary multiplier: long multiplication of the digit string by the decimal
// digits of the coefficient.  The product occupies orig_digs + coef_size
// digits in mul_num; the decimal point stays digs_post_dot digits from the
// end.
1595  const char* coef = s_BinCoefs[suff_idx];
1596  Uint4 coef_size = Uint4(strlen(coef));
1597  mul_num = new Uint1[orig_digs + coef_size];
1598  memset(mul_num.get(), 0, orig_digs + coef_size);
1599  for (Uint4 coef_i = 0; coef_i < coef_size; ++coef_i) {
1600  Uint1 coef_d = Uint1(coef[coef_i] - '0');
1601  Uint1 carry = 0;
1602  Uint4 res_idx = orig_digs + coef_i;
1603  for (int orig_i = orig_digs - 1; orig_i >= 0; --orig_i, --res_idx) {
1604  Uint1 orig_d = orig_num[orig_i];
1605  Uint1 res_d = Uint1(coef_d * orig_d + carry + mul_num[res_idx]);
1606  carry = 0;
1607  while (res_d >= 10) {
1608  res_d = (Uint1)(res_d - 10); // res_d -= 10;
1609  ++carry;
1610  }
1611  mul_num[res_idx] = res_d;
1612  }
1613  _ASSERT(carry <= 9);
// Propagate the remaining carry into the more significant digits.
1614  for (; carry != 0; --res_idx) {
1615  Uint1 res_d = Uint1(mul_num[res_idx] + carry);
1616  carry = 0;
1617  while (res_d >= 10) {
1618  res_d = (Uint1)(res_d - 10); // res_d -= 10;
1619  ++carry;
1620  }
1621  mul_num[res_idx] = res_d;
1622  }
1623  }
1624  digs_to_conv = orig_digs + coef_size - digs_post_dot;
1625  num_to_conv = mul_num.get();
// Drop leading zeros of the product, keeping at least one digit.
1626  while (digs_to_conv > 1 && *num_to_conv == 0) {
1627  --digs_to_conv;
1628  ++num_to_conv;
1629  }
1630  }
1631  else if (suff_idx < s_NumSuffixes) {
// Decimal multiplier 1000^(suff_idx+1): move the decimal point right by
// coef_size positions, consuming fractional digits first and appending
// zeros when the fraction is too short.
1632  Uint4 coef_size = (suff_idx + 1) * 3;
1633  if (coef_size <= digs_post_dot) {
1634  digs_to_conv += coef_size;
1635  digs_post_dot -= coef_size;
1636  }
1637  else {
1638  digs_to_conv += digs_post_dot;
1639  coef_size -= digs_post_dot;
1640  digs_post_dot = 0;
1641  mul_num = new Uint1[digs_to_conv + coef_size];
1642  memmove(mul_num.get(), num_to_conv, digs_to_conv);
1643  memset(mul_num.get() + digs_to_conv, 0, coef_size);
1644  num_to_conv = mul_num.get();
1645  digs_to_conv += coef_size;
1646  }
1647  }
1648 
// Accumulate the integer digits; the check before each step detects that
// n*10 + d would exceed kMax_UI8.
1649  const Uint8 limdiv = kMax_UI8/10;
1650  const int limoff = int(kMax_UI8 % 10);
1651  Uint8 n = 0;
1652  for (Uint4 i = 0; i < digs_to_conv; ++i) {
1653  Uint1 d = num_to_conv[i];
1654  if (n >= limdiv && (n > limdiv || d > limoff)) {
1655  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, i);
1656  }
1657  n *= 10;
1658  n += d;
1659  }
// Round up when the first dropped fractional digit is 5 or more.
1660  if (digs_post_dot != 0 && num_to_conv[digs_to_conv] >= 5) {
1661  if (n == kMax_UI8) {
1662  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, digs_to_conv);
1663  }
1664  ++n;
1665  }
1666  return n;
1667 }
1668 
1669 
// Converts a string to a size_t-sized unsigned value by dispatching, at
// compile time, to the 64-bit parser when SIZEOF_SIZE_T is greater than 4 and
// to the unsigned int parser otherwise.
// NOTE(review): the first signature line (listing line 1670) is missing from
// this listing, so the function name and return type are not visible here.
1671  TStringToNumFlags flags, int base)
1672 {
1673 #if (SIZEOF_SIZE_T > 4)
1674  return StringToUInt8(str, flags, base);
1675 #else
1676  return StringToUInt(str, flags, base);
1677 #endif
1678 }
1679 
1680 
1681 
1682 /// @internal
1683 static void s_SignedToString(string& out_str,
1684  unsigned long value,
1685  long svalue,
1687  int base)
1688 {
1689  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1690  char buffer[kBufSize];
1691  char* pos = buffer + kBufSize;
1692 
1693  if ( base == 10 ) {
1694  if ( svalue < 0 ) {
1695  value = static_cast<unsigned long>(-svalue);
1696  }
1697 
1698  if ( (flags & NStr::fWithCommas) ) {
1699  int cnt = -1;
1700  do {
1701  if (++cnt == 3) {
1702  *--pos = ',';
1703  cnt = 0;
1704  }
1705  unsigned long a = '0'+value;
1706  value /= 10;
1707  *--pos = char(a - value*10);
1708  } while ( value );
1709  }
1710  else {
1711  do {
1712  unsigned long a = '0'+value;
1713  value /= 10;
1714  *--pos = char(a - value*10);
1715  } while ( value );
1716  }
1717 
1718  if (svalue < 0)
1719  *--pos = '-';
1720  else if (flags & NStr::fWithSign)
1721  *--pos = '+';
1722  }
1723  else if ( base == 16 ) {
1724  do {
1725  *--pos = kDigit[value % 16];
1726  value /= 16;
1727  } while ( value );
1728  }
1729  else {
1730  do {
1731  *--pos = kDigit[value % base];
1732  value /= base;
1733  } while ( value );
1734  }
1735 
1736  out_str.assign(pos, buffer + kBufSize - pos);
1737 }
1738 
1739 
1740 void NStr::IntToString(string& out_str, int svalue,
1741  TNumToStringFlags flags, int base)
1742 {
1743  if ( base < 2 || base > 36 ) {
1744  CNcbiError::SetErrno(errno = EINVAL);
1745  return;
1746  }
1747  unsigned int value = static_cast<unsigned int>(svalue);
1748 
1749  if ( base == 10 && svalue < 0 ) {
1750  value = static_cast<unsigned int>(-svalue);
1751  }
1752  s_SignedToString(out_str, value, svalue, flags, base);
1753  errno = 0;
1754 }
1755 
1756 
1757 void NStr::LongToString(string& out_str, long svalue,
1758  TNumToStringFlags flags, int base)
1759 {
1760  if ( base < 2 || base > 36 ) {
1761  CNcbiError::SetErrno(errno = EINVAL);
1762  return;
1763  }
1764  unsigned long value = static_cast<unsigned long>(svalue);
1765 
1766  if ( base == 10 && svalue < 0 ) {
1767  value = static_cast<unsigned long>(-svalue);
1768  }
1769  s_SignedToString(out_str, value, svalue, flags, base);
1770  errno = 0;
1771 }
1772 
1773 
// Converts an unsigned long to a string in the given radix.
//
// `base` must be in [2, 36]; otherwise errno and CNcbiError are set to EINVAL
// and `out_str` is left untouched.  For base 10, fWithCommas inserts a comma
// between groups of three digits and fWithSign prepends '+'; neither flag is
// consulted for other bases.  errno is reset to 0 on success.
//
// NOTE(review): listing line 1776 (between `value` and `base`) is missing; it
// presumably declares the `flags` parameter used below -- confirm in the
// header.
1774 void NStr::ULongToString(string& out_str,
1775  unsigned long value,
1777  int base)
1778 {
1779  if ( base < 2 || base > 36 ) {
1780  CNcbiError::SetErrno(errno = EINVAL);
1781  return;
1782  }
// One char per bit covers the longest (binary) representation.  Digits are
// generated least significant first, so the buffer is filled from its end.
1783  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1784  char buffer[kBufSize];
1785  char* pos = buffer + kBufSize;
1786 
1787  if ( base == 10 ) {
1788  if ( (flags & fWithCommas) ) {
// cnt counts the digits written since the last comma; starting at -1 keeps a
// comma from being written before the rightmost group.
1789  int cnt = -1;
1790  do {
1791  if (++cnt == 3) {
1792  *--pos = ',';
1793  cnt = 0;
1794  }
// '0' + value - 10*(value/10) equals '0' + value % 10.
1795  unsigned long a = '0'+value;
1796  value /= 10;
1797  *--pos = char(a - value*10);
1798  } while ( value );
1799  }
1800  else {
1801  do {
1802  unsigned long a = '0'+value;
1803  value /= 10;
1804  *--pos = char(a - value*10);
1805  } while ( value );
1806  }
1807 
1808  if ( (flags & fWithSign) ) {
1809  *--pos = '+';
1810  }
1811  }
1812  else if ( base == 16 ) {
1813  do {
1814  *--pos = kDigit[value % 16];
1815  value /= 16;
1816  } while ( value );
1817  }
1818  else {
1819  do {
1820  *--pos = kDigit[value % base];
1821  value /= base;
1822  } while ( value );
1823  }
1824 
1825  out_str.assign(pos, buffer + kBufSize - pos);
1826  errno = 0;
1827 }
1828 
1829 
1830 
1831 // On some platforms division of Int8 is very slow,
1832 // so will try to optimize it working with chunks.
1833 // Works only for radix base == 10.
1834 
1835 #define PRINT_INT8_CHUNK 1000000000
1836 #define PRINT_INT8_CHUNK_SIZE 9
1837 
1838 /// @internal
1839 static char* s_PrintUint8(char* pos,
1840  Uint8 value,
1842  int base)
1843 {
1844  if ( base == 10 ) {
1845  if ( (flags & NStr::fWithCommas) ) {
1846  int cnt = -1;
1847 #ifdef PRINT_INT8_CHUNK
1848  // while n doesn't fit in Uint4 process the number
1849  // by 9-digit chunks within 32-bit Uint4
1850  while ( value & ~Uint8(Uint4(~0)) ) {
1851  Uint4 chunk = Uint4(value);
1852  value /= PRINT_INT8_CHUNK;
1853  chunk -= PRINT_INT8_CHUNK*Uint4(value);
1854  char* end = pos - PRINT_INT8_CHUNK_SIZE - 2; // 9-digit chunk should have 2 commas
1855  do {
1856  if (++cnt == 3) {
1857  *--pos = ',';
1858  cnt = 0;
1859  }
1860  Uint4 a = '0'+chunk;
1861  chunk /= 10;
1862  *--pos = char(a-10*chunk);
1863  } while ( pos != end );
1864  }
1865  // process all remaining digits in 32-bit number
1866  Uint4 chunk = Uint4(value);
1867  do {
1868  if (++cnt == 3) {
1869  *--pos = ',';
1870  cnt = 0;
1871  }
1872  Uint4 a = '0'+chunk;
1873  chunk /= 10;
1874  *--pos = char(a-10*chunk);
1875  } while ( chunk );
1876 #else
1877  do {
1878  if (++cnt == 3) {
1879  *--pos = ',';
1880  cnt = 0;
1881  }
1882  Uint8 a = '0'+value;
1883  value /= 10;
1884  *--pos = char(a - 10*value);
1885  } while ( value );
1886 #endif
1887  }
1888  else {
1889 #ifdef PRINT_INT8_CHUNK
1890  // while n doesn't fit in Uint4 process the number
1891  // by 9-digit chunks within 32-bit Uint4
1892  while ( value & ~Uint8(Uint4(~0)) ) {
1893  Uint4 chunk = Uint4(value);
1894  value /= PRINT_INT8_CHUNK;
1895  chunk -= PRINT_INT8_CHUNK*Uint4(value);
1896  char* end = pos - PRINT_INT8_CHUNK_SIZE;
1897  do {
1898  Uint4 a = '0'+chunk;
1899  chunk /= 10;
1900  *--pos = char(a-10*chunk);
1901  } while ( pos != end );
1902  }
1903  // process all remaining digits in 32-bit number
1904  Uint4 chunk = Uint4(value);
1905  do {
1906  Uint4 a = '0'+chunk;
1907  chunk /= 10;
1908  *--pos = char(a-10*chunk);
1909  } while ( chunk );
1910 #else
1911  do {
1912  Uint8 a = '0'+value;
1913  value /= 10;
1914  *--pos = char(a-10*value);
1915  } while ( value );
1916 #endif
1917  }
1918  }
1919  else if ( base == 16 ) {
1920  do {
1921  *--pos = kDigit[value % 16];
1922  value /= 16;
1923  } while ( value );
1924  }
1925  else {
1926  do {
1927  *--pos = kDigit[value % base];
1928  value /= base;
1929  } while ( value );
1930  }
1931  return pos;
1932 }
1933 
1934 
1935 void NStr::Int8ToString(string& out_str, Int8 svalue,
1936  TNumToStringFlags flags, int base)
1937 {
1938  if ( base < 2 || base > 36 ) {
1939  CNcbiError::SetErrno(errno = EINVAL);
1940  return;
1941  }
1942  Uint8 value;
1943  if (base == 10) {
1944  value = static_cast<Uint8>(svalue<0?-svalue:svalue);
1945  } else {
1946  value = static_cast<Uint8>(svalue);
1947  }
1948  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1949  char buffer[kBufSize];
1950 
1951  char* pos = s_PrintUint8(buffer + kBufSize, value, flags, base);
1952 
1953  if (base == 10) {
1954  if (svalue < 0)
1955  *--pos = '-';
1956  else if (flags & fWithSign)
1957  *--pos = '+';
1958  }
1959  out_str.assign(pos, buffer + kBufSize - pos);
1960  errno = 0;
1961 }
1962 
1963 
1964 void NStr::UInt8ToString(string& out_str, Uint8 value,
1965  TNumToStringFlags flags, int base)
1966 {
1967  if ( base < 2 || base > 36 ) {
1968  CNcbiError::SetErrno(errno = EINVAL);
1969  return;
1970  }
1971  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1972  char buffer[kBufSize];
1973 
1974  char* pos = s_PrintUint8(buffer + kBufSize, value, flags, base);
1975 
1976  if ( (base == 10) && (flags & fWithSign) ) {
1977  *--pos = '+';
1978  }
1979  out_str.assign(pos, buffer + kBufSize - pos);
1980  errno = 0;
1981 }
1982 
1983 
// Formats `value` as a human-readable data size with a K/M/G/T/P/E suffix.
//
// out_str     receives the result; its previous content is erased.
// value       the size to format.
// flags       only flags summed into `allowed_flags` are accepted; any other
//             bit makes the function throw CStringException (eConvert).
// max_digits  budget of digits shared by the integer and fractional part;
//             values below 3 are raised to 3.
//
// Without fDS_Binary the suffixes stand for powers of 1000, with fDS_Binary
// for powers of 1024.  The digits are prepared in a local buffer, rounded on
// the first dropped digit, and then assembled into out_str together with the
// optional sign, commas, decimal point and suffix.  errno is set to 0 at the
// end.
//
// NOTE(review): three terms of the `allowed_flags` sum (listing lines 1992,
// 1993 and 1995) are missing from this listing -- confirm in the repository.
1984 void NStr::UInt8ToString_DataSize(string& out_str,
1985  Uint8 value,
1986  TNumToStringFlags flags /* = 0 */,
1987  unsigned int max_digits /* = 3 */)
1988 {
1989  TNumToStringFlags allowed_flags = fWithSign +
1990  fWithCommas +
1991  fDS_Binary +
1994  fDS_ShortSuffix +
1996 
1997  if ((flags & allowed_flags) != flags) {
1998  NCBI_THROW2(CStringException, eConvert, "Wrong set of flags", 0);
1999  }
2000 
2001  if (max_digits < 3)
2002  max_digits = 3;
2003 
2004  static const char s_Suffixes[] = {'K', 'M', 'G', 'T', 'P', 'E'};
2005  static const Uint4 s_NumSuffixes = Uint4(sizeof(s_Suffixes) / sizeof(s_Suffixes[0]));
2006 
// Working state shared by both branches:
//   num_start .. dot_ptr  digits of the integer part (digs_pre_dot of them),
//   dot_ptr .. num_end    digits of the fractional part,
//   suff_idx              0 = no suffix, i > 0 = s_Suffixes[i - 1].
2007  static const SIZE_TYPE kBufSize = 50;
2008  char buffer[kBufSize];
2009  char* num_start;
2010  char* dot_ptr;
2011  char* num_end;
2012  Uint4 digs_pre_dot, suff_idx;
2013 
2014  if (!(flags &fDS_Binary)) {
// Decimal multipliers: s_Coefs[i] == 1000^(i+1).
2015  static const Uint8 s_Coefs[] = {1000, 1000000, 1000000000,
2016  NCBI_CONST_UINT8(1000000000000),
2017  NCBI_CONST_UINT8(1000000000000000),
2018  NCBI_CONST_UINT8(1000000000000000000)};
// suff_idx = number of multipliers that do not exceed the value.
2019  suff_idx = 0;
2020  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
2021  if (value < s_Coefs[suff_idx])
2022  break;
2023  }
// Print all digits at the end of the buffer.  The extra '0' in front of them
// is a spare digit that absorbs a carry produced by rounding.
2024  num_start = s_PrintUint8(buffer + kBufSize, value, 0, 10);
2025  num_start[-1] = '0';
// Dividing by 1000^suff_idx places the decimal point 3*suff_idx digits from
// the end.
2026  dot_ptr = buffer + kBufSize - 3 * suff_idx;
2027  digs_pre_dot = Uint4(dot_ptr - num_start);
2028  if (!(flags & fDS_NoDecimalPoint)) {
2029  num_end = min(buffer + kBufSize, dot_ptr + (max_digits - digs_pre_dot));
2030  }
2031  else {
// No fraction allowed: use a smaller suffix for as long as three more
// integer digits still fit into max_digits.
2032  while (suff_idx > 0 && max_digits - digs_pre_dot >= 3) {
2033  --suff_idx;
2034  digs_pre_dot += 3;
2035  dot_ptr += 3;
2036  }
2037  num_end = dot_ptr;
2038  }
// Round on the first dropped digit and propagate the carry leftwards
// ('0' + 10 marks a digit that overflowed past '9').
2039  char* round_dig = num_end - 1;
2040  if (num_end < buffer + kBufSize && *num_end >= '5')
2041  ++(*round_dig);
2042  while (*round_dig == '0' + 10) {
2043  *round_dig = '0';
2044  --round_dig;
2045  ++(*round_dig);
2046  }
// The carry reached the spare digit: the integer part grew by one digit,
// which may require moving on to the next suffix.
2047  if (round_dig < num_start) {
2048  _ASSERT(num_start - round_dig == 1);
2049  num_start = round_dig;
2050  ++digs_pre_dot;
2051  if (!(flags & fDS_NoDecimalPoint)) {
2052  if (digs_pre_dot > 3) {
2053  ++suff_idx;
2054  digs_pre_dot -= 3;
2055  dot_ptr -= 3;
2056  }
2057  --num_end;
2058  }
2059  else {
2060  if (digs_pre_dot > max_digits) {
2061  ++suff_idx;
2062  digs_pre_dot -= 3;
2063  dot_ptr -= 3;
2064  num_end = dot_ptr;
2065  }
2066  }
2067  }
2068  }
2069  else {
// Binary multipliers: s_Coefs[i] == 1024^i.  In this branch suff_idx is
// one-based while the digits are computed (the divisor is
// s_Coefs[suff_idx - 1]) and is decremented at the end of the branch.
2070  static const Uint8 s_Coefs[] = {1, 1024, 1048576, 1073741824,
2071  NCBI_CONST_UINT8(1099511627776),
2072  NCBI_CONST_UINT8(1125899906842624),
2073  NCBI_CONST_UINT8(1152921504606846976)};
2074 
2075  suff_idx = 1;
2076  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
2077  if (value < s_Coefs[suff_idx])
2078  break;
2079  }
2080  bool can_try_another = true;
// Re-entry point: (re)compute the digits for the current suff_idx.
2081 try_another_suffix:
2082  Uint8 mul_coef = s_Coefs[suff_idx - 1];
2083  Uint8 whole_num = value / mul_coef;
// Four integer digits do not fit into a budget of three: take a larger unit.
2084  if (max_digits == 3 && whole_num >= 1000) {
2085  ++suff_idx;
2086  goto try_another_suffix;
2087  }
2088  num_start = s_PrintUint8(buffer + kBufSize, whole_num, 0, 10);
2089  num_start[-1] = '0';
2090  digs_pre_dot = Uint4(buffer + kBufSize - num_start);
// Without a decimal point, and with room for at least three more digits,
// check whether a smaller unit still fits into max_digits.  The candidate
// number is printed into the first half of the buffer so that the digits at
// the end stay intact.
2091  if (max_digits - digs_pre_dot >= 3 && (flags & fDS_NoDecimalPoint)
2092  && suff_idx != 1 && can_try_another)
2093  {
2094  Uint4 new_suff = suff_idx - 1;
2095 try_even_more_suffix:
2096  Uint8 new_num = value / s_Coefs[new_suff - 1];
2097  char* new_start = s_PrintUint8(buffer + kBufSize / 2, new_num, 0, 10);
2098  Uint4 new_digs = Uint4(buffer + kBufSize / 2 - new_start);
2099  if (new_digs <= max_digits) {
2100  if (max_digits - digs_pre_dot >= 3 && new_suff != 1) {
2101  --new_suff;
2102  goto try_even_more_suffix;
2103  }
2104  suff_idx = new_suff;
2105  can_try_another = false;
2106  goto try_another_suffix;
2107  }
2108  if (new_suff != suff_idx - 1) {
2109  suff_idx = new_suff + 1;
2110  can_try_another = false;
2111  goto try_another_suffix;
2112  }
2113  }
// Move the integer digits, together with the spare leading '0', to the start
// of the buffer so that fractional digits can be appended after them.
2114  memcpy(buffer, num_start - 1, digs_pre_dot + 1);
2115  num_start = buffer + 1;
2116  dot_ptr = num_start + digs_pre_dot;
// One digit more than will be shown is generated; it is used for rounding.
2117  Uint4 cnt_more_digs = 1;
2118  if (!(flags & fDS_NoDecimalPoint))
2119  cnt_more_digs += min(max_digits - digs_pre_dot, 3 * (suff_idx - 1));
2120  num_end = dot_ptr;
// Long division of the remainder by the multiplier, one decimal digit per
// iteration.
2121  Uint8 left_val = value - whole_num * mul_coef;
2122  do {
2123  left_val *= 10;
2124  Uint1 d = Uint1(left_val / mul_coef);
2125  *num_end = char(d + '0');
2126  ++num_end;
2127  left_val -= d * mul_coef;
2128  --cnt_more_digs;
2129  }
2130  while (cnt_more_digs != 0);
2131  --num_end;
2132 
// Round on the extra digit and propagate the carry, as in the decimal branch.
2133  char* round_dig = num_end - 1;
2134  if (*num_end >= '5')
2135  ++(*round_dig);
2136  while (*round_dig == '0' + 10) {
2137  *round_dig = '0';
2138  --round_dig;
2139  ++(*round_dig);
2140  }
2141  if (round_dig < num_start) {
2142  _ASSERT(round_dig == buffer);
2143  num_start = round_dig;
2144  ++digs_pre_dot;
2145  if (digs_pre_dot > max_digits) {
2146  ++suff_idx;
2147  goto try_another_suffix;
2148  }
2149  if (num_end != dot_ptr)
2150  --num_end;
2151  }
// An integer part of exactly "1024" when a decimal point is allowed: redo the
// computation with the next larger suffix.
2152  if (!(flags & fDS_NoDecimalPoint) && digs_pre_dot == 4
2153  && num_start[0] == '1' && num_start[1] == '0'
2154  && num_start[2] == '2' && num_start[3] == '4')
2155  {
2156  ++suff_idx;
2157  goto try_another_suffix;
2158  }
2159 
// Back to the common convention: 0 = no suffix.
2160  --suff_idx;
2161  }
2162 
// Assemble the output: sign, integer part (optionally grouped by commas),
// fractional part, suffix.
2163  out_str.erase();
2164  if (flags & fWithSign)
2165  out_str.append(1, '+');
2166  if (!(flags & fWithCommas) || digs_pre_dot <= 3) {
2167  out_str.append(num_start, digs_pre_dot);
2168  }
2169  else {
// The leftmost group takes the digits that do not form a full triple.
2170  Uint4 digs_first = digs_pre_dot % 3;
2171  out_str.append(num_start, digs_first);
2172  char* left_ptr = num_start + digs_first;
2173  Uint4 digs_left = digs_pre_dot - digs_first;
2174  while (digs_left != 0) {
2175  out_str.append(1, ',');
2176  out_str.append(left_ptr, 3);
2177  left_ptr += 3;
2178  digs_left -= 3;
2179  }
2180  }
2181  if (num_end != dot_ptr) {
2182  out_str.append(1, '.');
2183  out_str.append(dot_ptr, num_end - dot_ptr);
2184  }
2185 
// Suffix: nothing, or " B" / "B" on request, when no multiplier applies;
// otherwise the multiplier letter followed by "B" or "iB" unless the short
// form is requested.
2186  if (suff_idx == 0) {
2187  if (flags & fDS_PutBSuffixToo) {
2188  if (flags & fDS_PutSpaceBeforeSuffix)
2189  out_str.append(1, ' ');
2190  out_str.append(1, 'B');
2191  }
2192  }
2193  else {
2194  --suff_idx;
2195  if (flags & fDS_PutSpaceBeforeSuffix)
2196  out_str.append(1, ' ');
2197  out_str.append(1, s_Suffixes[suff_idx]);
2198  if (!(flags & fDS_ShortSuffix)) {
2199  if (flags & fDS_Binary)
2200  out_str.append(1, 'i');
2201  out_str.append(1, 'B');
2202  }
2203  }
2204  errno = 0;
2205 }
2206 
2207 
2208 // Maximal precision used by the double-to-string conversions; the value
// depends on the platform (200 on MS Windows, 308 elsewhere).
2209 #if defined(NCBI_OS_MSWIN)
2210  const int kMaxDoublePrecision = 200;
2211 #else
2212  const int kMaxDoublePrecision = 308;
2213 #endif
2214 // Maximal size of a double value in string form:
2215 // exponent size + sign + dot + ending '\0' + maximal precision.
// NOTE(review): the definition this comment describes (listing line 2216,
// presumably kMaxDoubleStringSize, which is used by the conversions below)
// is missing from this listing.
2217 
2218 
// Converts a double to a string and stores it in `out_str`.
//
// With a non-negative `precision`, or for a non-finite or zero value when
// fDoublePosix is set, the work is delegated to the buffer-based
// DoubleToString() overload.  Otherwise the value is printed with "%f", "%e"
// or "%g" (default) depending on `flags`, using the default precision of
// printf.  With fDoublePosix the locale's decimal point is replaced by '.'.
// errno is reset to 0 at the end.
//
// NOTE(review): listing lines 2220 (rest of the signature, presumably
// `precision` and `flags`) and 2222 (presumably the declaration of `buffer`)
// are missing from this listing -- confirm in the repository.
2219 void NStr::DoubleToString(string& out_str, double value,
2221 {
2223  if (precision >= 0 ||
2224  ((flags & fDoublePosix) && (!finite(value) || value == 0.))) {
// The overload returns the number of chars stored and does not promise a
// terminating '\0', so one is written here.
2225  SIZE_TYPE n = DoubleToString(value, precision, buffer,
2226  kMaxDoubleStringSize, flags);
2227  buffer[n] = '\0';
2228  } else {
2229  const char* format;
2230  switch (flags & fDoubleGeneral) {
2231  case fDoubleFixed:
2232  format = "%f";
2233  break;
2234  case fDoubleScientific:
2235  format = "%e";
2236  break;
2237  case fDoubleGeneral: // default
2238  default:
2239  format = "%g";
2240  break;
2241  }
2242  ::sprintf(buffer, format, value);
2243  if (flags & fDoublePosix) {
// Only the first byte of the locale's decimal point is considered.
2244  struct lconv* conv = localeconv();
2245  if ('.' != *(conv->decimal_point)) {
2246  char* pos = strchr(buffer, *(conv->decimal_point));
2247  if (pos) {
2248  *pos = '.';
2249  }
2250  }
2251  }
2252  }
2253  out_str = buffer;
2254  errno = 0;
2255 }
2256 
2257 
// Converts a double into the caller-supplied buffer `buf` and returns the
// number of chars copied, which is at most `buf_size`.
//
// With fDoublePosix, zero and non-finite values are written as "0", "-0",
// "NaN", "INF" or "-INF".  All other values are printed with the requested
// precision (clamped to kMaxDoublePrecision) using "%.*e", "%.*g" or "%.*f"
// (default); with fDoublePosix the locale's decimal point is replaced by '.'.
// errno is reset to 0 at the end.
//
// NOTE(review): listing lines 2258 (start of the signature), 2260 (rest of
// the signature) and 2262 (presumably the declaration of the local `buffer`)
// are missing from this listing -- confirm in the repository.
// NOTE(review): in the special-value branch `n` is set to the string length
// plus one, so the terminating '\0' is counted and copied, whereas in the
// sprintf() branch `n` excludes it.  Confirm that this difference in the
// returned size is intended.
2259  char* buf, SIZE_TYPE buf_size,
2261 {
2263  int n = 0;
2264  if ((flags & fDoublePosix) && (!finite(value) || value == 0.)) {
2265  if (value == 0.) {
// +0.0 and -0.0 compare equal, so the bit patterns are compared to tell
// them apart.
2266  double zero = 0.;
2267  if (memcmp(&value, &zero, sizeof(double)) == 0) {
2268  strcpy(buffer, "0");
2269  n = 2;
2270  } else {
2271  strcpy(buffer, "-0");
2272  n = 3;
2273  }
2274  } else if (isnan(value)) {
2275  strcpy(buffer, "NaN");
2276  n = 4;
2277  } else if (value > 0.) {
2278  strcpy(buffer, "INF");
2279  n = 4;
2280  } else {
2281  strcpy(buffer, "-INF");
2282  n = 5;
2283  }
2284  } else {
2285  if (precision > (unsigned int)kMaxDoublePrecision) {
2286  precision = (unsigned int)kMaxDoublePrecision;
2287  }
2288  const char* format;
2289  switch (flags & fDoubleGeneral) {
2290  case fDoubleScientific:
2291  format = "%.*e";
2292  break;
2293  case fDoubleGeneral:
2294  format = "%.*g";
2295  break;
2296  case fDoubleFixed: // default
2297  default:
2298  format = "%.*f";
2299  break;
2300  }
2301  n = ::sprintf(buffer, format, (int)precision, value);
// A negative result signals an output error; treat it as "nothing written".
2302  if (n < 0) {
2303  n = 0;
2304  }
2305  if (flags & fDoublePosix) {
2306  struct lconv* conv = localeconv();
2307  if ('.' != *(conv->decimal_point)) {
2308  char* pos = strchr(buffer, *(conv->decimal_point));
2309  if (pos) {
2310  *pos = '.';
2311  }
2312  }
2313  }
2314  }
// Truncate to the caller's buffer size; no terminator is appended here.
2315  SIZE_TYPE n_copy = min((SIZE_TYPE) n, buf_size);
2316  memcpy(buf, buffer, n_copy);
2317  errno = 0;
2318  return n_copy;
2319 }
2320 
2321 
/// Write the decimal representation of `value` at `buffer`.
///
/// No terminating '\0' is written.
///
/// @param buffer  Destination.  It must have room for `digits` chars when
///                `zeros` is set, and for all decimal digits of `value`
///                otherwise.
/// @param value   Number to print.
/// @param digits  Field width, used only when `zeros` is set.  A width of 0
///                writes nothing.
/// @param zeros   If true, write exactly `digits` chars: the `digits` lowest
///                decimal digits of `value`, padded with '0' on the left.
///                If false, write the significant digits only.
/// @return Pointer just past the last char written.
char* s_ncbi_append_int2str(char* buffer, unsigned int value, size_t digits, bool zeros)
{
    if (zeros) {
        // Fill the field from its right end.  Iterating over positions
        // (instead of decrementing `digits` in a do-while) makes a zero
        // width a no-op rather than a size_t wrap-around.
        char* const field_end = buffer + digits;
        for (char* p = field_end;  p != buffer; ) {
            *--p = (char)('0' + value % 10);
            value /= 10;
        }
        return field_end;
    }

    // Generate the digits right-to-left in a scratch area and copy them to
    // the destination afterwards, so that nothing is ever written in front
    // of `buffer`, whatever the relation between `digits` and the actual
    // number of digits is.  3 chars per byte exceed the longest decimal
    // representation of an unsigned int.
    char        scratch[3 * sizeof(value) + 1];
    char* const scratch_end = scratch + sizeof(scratch);
    char*       first = scratch_end;
    do {
        *--first = (char)('0' + value % 10);
    } while (value /= 10);

    const size_t count = (size_t)(scratch_end - first);
    memcpy(buffer, first, count);
    return buffer + count;
}
2343 
2344 
2345 #define __NLG NCBI_CONST_LONGDOUBLE
2346 
2348  char* buffer, SIZE_TYPE bufsize,
2349  int* dec, int* sign)
2350 {
2351  //errno = 0;
2352  *dec = *sign = 0;
2353  if (precision==0) {
2354  return 0;
2355  }
2356  if (precision > DBL_DIG) {
2357  precision = DBL_DIG;
2358  }
2359  if (val == 0.) {
2360  double zero = 0.;
2361  if (memcmp(&val, &zero, sizeof(double)) == 0) {
2362  *buffer='0';
2363  return 1;
2364  }
2365  *buffer='-';
2366  *(++buffer)='0';
2367  *sign = -1;
2368  return 2;
2369  }
2370  *sign = val < 0. ? -1 : 1;
2371  if (*sign < 0) {
2372  val = -val;
2373  }
2374  bool high_precision = precision > 9;
2375 
2376 // calculate exponent
2377  unsigned int exp=0;
2378  bool exp_positive = val >= 1.;
2379  unsigned int first, second=0;
2380  long double mult = __NLG(1.);
2381  long double value = val;
2382 
2383  if (exp_positive) {
2384  while (value>=__NLG(1.e256))
2385  {value/=__NLG(1.e256); exp+=256;}
2386  if (value >= __NLG(1.e16)) {
2387  if (value>=__NLG(1.e240)) {value*=__NLG(1.e-240); exp+=240;}
2388  else if (value>=__NLG(1.e224)) {value*=__NLG(1.e-224); exp+=224;}
2389  else if (value>=__NLG(1.e208)) {value*=__NLG(1.e-208); exp+=208;}
2390  else if (value>=__NLG(1.e192)) {value*=__NLG(1.e-192); exp+=192;}
2391  else if (value>=__NLG(1.e176)) {value*=__NLG(1.e-176); exp+=176;}
2392  else if (value>=__NLG(1.e160)) {value*=__NLG(1.e-160); exp+=160;}
2393  else if (value>=__NLG(1.e144)) {value*=__NLG(1.e-144); exp+=144;}
2394  else if (value>=__NLG(1.e128)) {value*=__NLG(1.e-128); exp+=128;}
2395  else if (value>=__NLG(1.e112)) {value*=__NLG(1.e-112); exp+=112;}
2396  else if (value>=__NLG(1.e96)) {value*=__NLG(1.e-96); exp+=96;}
2397  else if (value>=__NLG(1.e80)) {value*=__NLG(1.e-80); exp+=80;}
2398  else if (value>=__NLG(1.e64)) {value*=__NLG(1.e-64); exp+=64;}
2399  else if (value>=__NLG(1.e48)) {value*=__NLG(1.e-48); exp+=48;}
2400  else if (value>=__NLG(1.e32)) {value*=__NLG(1.e-32); exp+=32;}
2401  else if (value>=__NLG(1.e16)) {value*=__NLG(1.e-16); exp+=16;}
2402  }
2403  if (value< __NLG(1.)) {mult=__NLG(1.e+9); exp-= 1;}
2404  else if (value< __NLG(10.)) {mult=__NLG(1.e+8); }
2405  else if (value< __NLG(1.e2)) {mult=__NLG(1.e+7); exp+= 1;}
2406  else if (value< __NLG(1.e3)) {mult=__NLG(1.e+6); exp+= 2;}
2407  else if (value< __NLG(1.e4)) {mult=__NLG(1.e+5); exp+= 3;}
2408  else if (value< __NLG(1.e5)) {mult=__NLG(1.e+4); exp+= 4;}
2409  else if (value< __NLG(1.e6)) {mult=__NLG(1.e+3); exp+= 5;}
2410  else if (value< __NLG(1.e7)) {mult=__NLG(1.e+2); exp+= 6;}
2411  else if (value< __NLG(1.e8)) {mult= __NLG(10.); exp+= 7;}
2412  else if (value< __NLG(1.e9)) {mult= __NLG(1.); exp+= 8;}
2413  else if (value<__NLG(1.e10)) {mult= __NLG(0.1); exp+= 9;}
2414  else if (value<__NLG(1.e11)) {mult=__NLG(1.e-2); exp+=10;}
2415  else if (value<__NLG(1.e12)) {mult=__NLG(1.e-3); exp+=11;}
2416  else if (value<__NLG(1.e13)) {mult=__NLG(1.e-4); exp+=12;}
2417  else if (value<__NLG(1.e14)) {mult=__NLG(1.e-5); exp+=13;}
2418  else if (value<__NLG(1.e15)) {mult=__NLG(1.e-6); exp+=14;}
2419  else if (value<__NLG(1.e16)) {mult=__NLG(1.e-7); exp+=15;}
2420  else {mult=__NLG(1.e-8); exp+=16;}
2421  } else {
2422  while (value<=__NLG(1.e-256))
2423  {value*=__NLG(1.e256); exp+=256;}
2424  if (value <= __NLG(1.e-16)) {
2425  if (value<=__NLG(1.e-240)) {value*=__NLG(1.e240); exp+=240;}
2426  else if (value<=__NLG(1.e-224)) {value*=__NLG(1.e224); exp+=224;}
2427  else if (value<=__NLG(1.e-208)) {value*=__NLG(1.e208); exp+=208;}
2428  else if (value<=__NLG(1.e-192)) {value*=__NLG(1.e192); exp+=192;}
2429  else if (value<=__NLG(1.e-176)) {value*=__NLG(1.e176); exp+=176;}
2430  else if (value<=__NLG(1.e-160)) {value*=__NLG(1.e160); exp+=160;}
2431  else if (value<=__NLG(1.e-144)) {value*=__NLG(1.e144); exp+=144;}
2432  else if (value<=__NLG(1.e-128)) {value*=__NLG(1.e128); exp+=128;}
2433  else if (value<=__NLG(1.e-112)) {value*=__NLG(1.e112); exp+=112;}
2434  else if (value<=__NLG(1.e-96)) {value*=__NLG(1.e96); exp+=96;}
2435  else if (value<=__NLG(1.e-80)) {value*=__NLG(1.e80); exp+=80;}
2436  else if (value<=__NLG(1.e-64)) {value*=__NLG(1.e64); exp+=64;}
2437  else if (value<=__NLG(1.e-48)) {value*=__NLG(1.e48); exp+=48;}
2438  else if (value<=__NLG(1.e-32)) {value*=__NLG(1.e32); exp+=32;}
2439  else if (value<=__NLG(1.e-16)) {value*=__NLG(1.e16); exp+=16;}
2440  }
2441  if (value<__NLG(1.e-15)) {mult=__NLG(1.e24); exp+=16;}
2442  else if (value<__NLG(1.e-14)) {mult=__NLG(1.e23); exp+=15;}
2443  else if (value<__NLG(1.e-13)) {mult=__NLG(1.e22); exp+=14;}
2444  else if (value<__NLG(1.e-12)) {mult=__NLG(1.e21); exp+=13;}
2445  else if (value<__NLG(1.e-11)) {mult=__NLG(1.e20); exp+=12;}
2446  else if (value<__NLG(1.e-10)) {mult=__NLG(1.e19); exp+=11;}
2447  else if (value<__NLG(1.e-9)) {mult=__NLG(1.e18); exp+=10;}
2448  else if (value<__NLG(1.e-8)) {mult=__NLG(1.e17); exp+=9;}
2449  else if (value<__NLG(1.e-7)) {mult=__NLG(1.e16); exp+=8;}
2450  else if (value<__NLG(1.e-6)) {mult=__NLG(1.e15); exp+=7;}
2451  else if (value<__NLG(1.e-5)) {mult=__NLG(1.e14); exp+=6;}
2452  else if (value<__NLG(1.e-4)) {mult=__NLG(1.e13); exp+=5;}
2453  else if (value<__NLG(1.e-3)) {mult=__NLG(1.e12); exp+=4;}
2454  else if (value<__NLG(1.e-2)) {mult=__NLG(1.e11); exp+=3;}
2455  else if (value<__NLG(1.e-1)) {mult=__NLG(1.e10); exp+=2;}
2456  else if (value<__NLG(1.)) {mult=__NLG(1.e9); exp+=1;}
2457  else {mult=__NLG(1.e8); }
2458  }
2459 
2460 // get all digits
2461  long double t1 = value * mult;
2462  if (t1 >= __NLG(1.e9)) {
2463  first = 999999999;
2464  } else if (t1 < __NLG(1.e8)) {
2465  first = 100000000;
2466  t1 = first;
2467  } else {
2468  first = (unsigned int)t1;
2469  }
2470  if (high_precision) {
2471  long double t2 = (t1-first) * __NLG(1.e8);
2472  if (t2 >= __NLG(1.e8)) {
2473  second = 99999999;
2474  } else {
2475  second = (unsigned int)t2;
2476  }
2477  }
2478 
2479 // convert them into string
2480  bool use_ext_buffer = bufsize > 20;
2481  char tmp[32];
2482  char *digits = use_ext_buffer ? buffer : tmp;
2483  char *digits_end = s_ncbi_append_int2str(digits,first,9,false);
2484  if (high_precision) {
2485  digits_end = s_ncbi_append_int2str(digits_end,second,8,true);
2486  }
2487  size_t digits_len = digits_end - digits;
2488  size_t digits_got = digits_len;
2489  size_t digits_expected = high_precision ? 17 : 9;
2490 
2491 // get significant digits according to requested precision
2492  size_t pos = precision;
2493  if (digits_len > precision) {
2494  digits_len = precision;
2495 
2496  // this is questionable, but in fact,
2497  // improves the result (on average)
2498 #if 1
2499  if (high_precision) {
2500  if (digits[pos] == '4') {
2501  size_t pt = pos-1;
2502  while (pt != 0 && digits[--pt] == '9')
2503  ;
2504  if (pt != 0 && (pos-pt) > precision/2)
2505  digits[pos]='5';
2506  } else if (digits[pos] == '5') {
2507  size_t pt = pos;
2508  while (pt != 0 && digits[--pt] == '0')
2509  ;
2510  if (pt != 0 && (pos-pt) > precision/2)
2511  digits[pos]='4';
2512  }
2513  }
2514 #endif
2515 
2516  if (digits[pos] >= '5') {
2517  do {
2518  if (digits[--pos] < '9') {
2519  ++digits[pos++];
2520  break;
2521  }
2522  digits[pos]='0';
2523  } while (pos > 0);
2524  if (pos == 0) {
2525  if (digits_expected <= digits_got) {
2526  if (exp_positive) {
2527  ++exp;
2528  } else {
2529 // exp cannot be 0, by design
2530  exp_positive = --exp == 0;
2531  }
2532  }
2533  *digits = '1';
2534  digits_len = 1;
2535  }
2536  }
2537  }
2538 
2539 // truncate trailing zeros
2540  for (pos = digits_len; pos-- > 0 && digits[pos] == '0';)
2541  --digits_len;
2542 
2543  *dec = (int)exp;
2544  if (!exp_positive) {
2545  *dec = -*dec;
2546  }
2547  if (!use_ext_buffer) {
2548  if (digits_len <= bufsize) {
2549  strncpy(buffer,digits,digits_len);
2550  } else {
2551  NCBI_THROW2(CStringException, eConvert,
2552  "Destination buffer too small", 0);
2553  }
2554  }
2555  return digits_len;
2556 }
2557 #undef __NLG
2558 
2559 
2561  char* buffer, SIZE_TYPE bufsize)
2562 {
2563  if (bufsize < precision+8) {
2564  NCBI_THROW2(CStringException, eConvert,
2565  "Destination buffer too small", 0);
2566  }
2567  int dec=0, sign=0;
2568  char digits[32];
2569  size_t digits_len = DoubleToString_Ecvt(
2570  val, precision, digits, sizeof(digits), &dec, &sign);
2571  if (digits_len == 0) {
2572  errno = 0;
2573  return 0;
2574  }
2575  if (val == 0.) {
2576  strncpy(buffer,digits, digits_len);
2577  return digits_len;
2578  }
2579  if (digits_len == 1 && dec == 0 && sign >=0) {
2580  *buffer = digits[0];
2581  errno = 0;
2582  return 1;
2583  }
2584  bool exp_positive = dec >= 0;
2585  unsigned int exp= (unsigned int)(exp_positive ? dec : (-dec));
2586 
2587  // assemble the result
2588  char *buffer_pos = buffer;
2589 // char *buffer_end = buffer + bufsize;
2590  char *digits_pos = digits;
2591 
2592  if (sign < 0) {
2593  *buffer_pos++ = '-';
2594  }
2595  // The 'e' format is used when the exponent of the value is less than -4
2596  // or greater than or equal to the precision argument
2597  if ((exp_positive && exp >= precision) || (!exp_positive && exp > 4)) {
2598  *buffer_pos++ = *digits_pos++;
2599  --digits_len;
2600  if (digits_len != 0) {
2601  *buffer_pos++ = '.';
2602  strncpy(buffer_pos,digits_pos,digits_len);
2603  buffer_pos += digits_len;
2604  }
2605  *buffer_pos++ = 'e';
2606  *buffer_pos++ = exp_positive ? '+' : '-';
2607 
2608 //#if defined(NCBI_OS_MSWIN)
2609 #if NCBI_COMPILER_MSVC && _MSC_VER < 1900
2610  bool need_zeros = true;
2611  size_t need_digits = 3;
2612 #else
2613  bool need_zeros = exp < 10 ? true : false;
2614  size_t need_digits = exp < 100 ? 2 : 3;
2615 #endif
2616  // assuming exp < 1000
2617  buffer_pos = s_ncbi_append_int2str(buffer_pos, exp, need_digits,need_zeros);
2618  } else if (exp_positive) {
2619  *buffer_pos++ = *digits_pos++;
2620  --digits_len;
2621  if (digits_len > exp) {
2622  strncpy(buffer_pos,digits_pos,exp);
2623  buffer_pos += exp;
2624  *buffer_pos++ = '.';
2625  strncpy(buffer_pos,digits_pos+exp,digits_len-exp);
2626  buffer_pos += digits_len-exp;
2627  } else {
2628  strncpy(buffer_pos,digits_pos,digits_len);
2629  buffer_pos += digits_len;
2630  exp -= (unsigned int)digits_len;
2631  while (exp--) {
2632  *buffer_pos++ = '0';
2633  }
2634  }
2635  } else {
2636  *buffer_pos++ = '0';
2637  *buffer_pos++ = '.';
2638  for (--exp; exp--;) {
2639  *buffer_pos++ = '0';
2640  }
2641  strncpy(buffer_pos,digits_pos, digits_len);
2642  buffer_pos += digits_len;
2643  }
2644  errno = 0;
2645  return buffer_pos - buffer;
2646 }
2647 
2648 
2650 {
2651 #if (SIZEOF_SIZE_T > 4)
2652  return UInt8ToString(value, flags, base);
2653 #else
2654  return UIntToString(static_cast<unsigned int>(value), flags, base);
2655 #endif
2656 }
2657 
2658 
2659 string NStr::PtrToString(const void* value)
2660 {
2661  errno = 0;
2662  char buffer[64];
2663  ::sprintf(buffer, "%p", value);
2664  return buffer;
2665 }
2666 
2667 
2668 void NStr::PtrToString(string& out_str, const void* value)
2669 {
2670  errno = 0;
2671  char buffer[64];
2672  ::sprintf(buffer, "%p", value);
2673  out_str = buffer;
2674 }
2675 
2676 
2678 {
2679  errno = 0;
2680  void *ptr = NULL;
2681  int res;
2682  if ( str.HasZeroAtEnd() ) {
2683  res = ::sscanf(str.data(), "%p", &ptr);
2684  } else {
2685  res = ::sscanf(string(str).c_str(), "%p", &ptr);
2686  }
2687  if (res != 1) {
2688  if (flags & fConvErr_NoErrMessage) {
2689  CNcbiError::SetErrno(errno = EINVAL);
2690  } else {
2691  CNcbiError::SetErrno(errno = EINVAL, str);
2692  }
2693  return NULL;
2694  }
2695  return ptr;
2696 }
2697 
2698 
// Textual spellings of boolean values.
// "true"/"false" are what BoolToString() returns; the string-to-bool
// conversion that follows compares its input against all eight spellings
// case-insensitively (AStrEquiv with PNocase).
2699 static const char* s_kTrueString = "true";
2700 static const char* s_kFalseString = "false";
2701 static const char* s_kTString = "t";
2702 static const char* s_kFString = "f";
2703 static const char* s_kYesString = "yes";
2704 static const char* s_kNoString = "no";
2705 static const char* s_kYString = "y";
2706 static const char* s_kNString = "n";
2707 
2708 
2709 const string NStr::BoolToString(bool value)
2710 {
2711  return value ? s_kTrueString : s_kFalseString;
2712 }
2713 
2714 
2716 {
2717  if ( AStrEquiv(str, s_kTrueString, PNocase()) ||
2718  AStrEquiv(str, s_kTString, PNocase()) ||
2719  AStrEquiv(str, s_kYesString, PNocase()) ||
2720  AStrEquiv(str, s_kYString, PNocase()) ) {
2721  errno = 0;
2722  return true;
2723  }
2724  if ( AStrEquiv(str, s_kFalseString, PNocase()) ||
2725  AStrEquiv(str, s_kFString, PNocase()) ||
2726  AStrEquiv(str, s_kNoString, PNocase()) ||
2727  AStrEquiv(str, s_kNString, PNocase()) ) {
2728  errno = 0;
2729  return false;
2730  }
2731  NCBI_THROW2(CStringException, eConvert,
2732  "String cannot be converted to bool", 0);
2733 }
2734 
2735 
2736 string NStr::FormatVarargs(const char* format, va_list args)
2737 {
2738 #ifdef HAVE_VASPRINTF
2739  char* s;
2740  int n = vasprintf(&s, format, args);
2741  if (n >= 0) {
2742  string str(s, n);
2743  free(s);
2744  return str;
2745  } else {
2746  return kEmptyStr;
2747  }
2748 
2749 #elif defined(NCBI_COMPILER_GCC) && defined(NO_PUBSYNC)
2750  CNcbiOstrstream oss;
2751  oss.vform(format, args);
2752  return CNcbiOstrstreamToString(oss);
2753 
2754 #elif defined(HAVE_VSNPRINTF)
2755  // deal with implementation quirks
2756  SIZE_TYPE size = 1024;
2757  AutoArray<char> buf(size);
2758  buf.get()[size-1] = buf.get()[size-2] = 0;
2759  SIZE_TYPE n = vsnprintf(buf.get(), size, format, args);
2760  while (n >= size || buf.get()[size-2]) {
2761  if (buf.get()[size-1]) {
2762  ERR_POST_X(1, Warning << "Buffer overrun by buggy vsnprintf");
2763  }
2764  size = max(size << 1, n);
2765  buf.reset(new char[size]);
2766  buf.get()[size-1] = buf.get()[size-2] = 0;
2767  n = vsnprintf(buf.get(), size, format, args);
2768  }
2769  return (n > 0) ? string(buf.get(), n) : kEmptyStr;
2770 
2771 #elif defined(HAVE_VPRINTF)
2772  char buf[1024];
2773  buf[sizeof(buf) - 1] = 0;
2774  vsprintf(buf, format, args);
2775  if (buf[sizeof(buf) - 1]) {
2776  ERR_POST_X(2, Warning << "Buffer overrun by vsprintf");
2777  }
2778  return buf;
2779 
2780 #else
2781 # error Please port this code to your system.
2782 #endif
2783 }
2784 
2785 
2787  const CTempString pattern,
2788  ECase use_case,
2789  EDirection direction,
2790  SIZE_TYPE occurence)
2791 {
2792  const SIZE_TYPE slen = str.length();
2793  const SIZE_TYPE plen = pattern.length();
2794  SIZE_TYPE current_occurence = 0;
2795  SIZE_TYPE pos = 0;
2796  SIZE_TYPE current_pos = 0; // saved position of last search
2797  SIZE_TYPE search_pos = 0; // next search position
2798 
2799  if (plen > slen) {
2800  return NPOS;
2801  }
2802 
2803  if (use_case == eCase) {
2804 
2805  if (direction == eForwardSearch) {
2806  do {
2807  pos = str.find(pattern, search_pos);
2808  if (pos == NPOS) {
2809  return NPOS;
2810  }
2811  current_pos = pos;
2812  search_pos = pos + plen;
2813  ++current_occurence;
2814  }
2815  while (current_occurence <= occurence);
2816 
2817  } else {
2818  _ASSERT(direction == eReverseSearch);
2819  search_pos = slen - plen;
2820  do {
2821  pos = str.rfind(pattern, search_pos);
2822  if (pos == NPOS) {
2823  return NPOS;
2824  }
2825  current_pos = pos;
2826  search_pos = (pos < plen) ? 0 : pos - plen;
2827  ++current_occurence;
2828  }
2829  while (current_occurence <= occurence);
2830  }
2831 
2832  } else {
2833  _ASSERT(use_case == eNocase);
2834 
2835  // A set of lower/upper characters for pattern[0].
2836  string x_first(pattern, 0, 1);
2837  if (isupper((unsigned char)x_first[0])) {
2838  x_first += (char)tolower((unsigned char)x_first[0]);
2839  } else if (islower((unsigned char)x_first[0])) {
2840  x_first += (char)toupper((unsigned char)x_first[0]);
2841  }
2842 
2843  if (direction == eForwardSearch) {
2844  do {
2845  pos = str.find_first_of(x_first, search_pos);
2846  while (pos != NPOS) {
2847  if ( (pos + plen) > slen ) {
2848  return NPOS;
2849  }
2850  if ( CompareNocase(str, pos, plen, pattern) == 0 ) {
2851  break;
2852  }
2853  pos = str.find_first_of(x_first, pos + 1);
2854  }
2855  if (pos > slen) {
2856  return NPOS;
2857  }
2858  current_pos = pos;
2859  search_pos = pos + plen;
2860  ++current_occurence;
2861  }
2862  while (current_occurence <= occurence);
2863 
2864  } else {
2865  _ASSERT(direction == eReverseSearch);
2866  search_pos = slen - plen;
2867  do {
2868  pos = str.find_last_of(x_first, search_pos);
2869  while (pos != NPOS && pos
2870  && CompareNocase(str, pos, plen, pattern) != 0) {
2871  if (pos == 0) {
2872  return NPOS;
2873  }
2874  pos = str.find_last_of(x_first, pos - 1);
2875  }
2876  current_pos = pos;
2877  search_pos = (pos < plen) ? 0 : pos - plen;
2878  ++current_occurence;
2879  }
2880  while (current_occurence <= occurence);
2881  }
2882  }
2883  return current_pos;
2884 }
2885 
2886 
2887 // @deprecated
2889  SIZE_TYPE start, SIZE_TYPE end, EOccurrence where)
2890 {
2891  string pat(pattern, 0, 1);
2892  SIZE_TYPE l = pattern.size();
2893  if (isupper((unsigned char) pat[0])) {
2894  pat += (char) tolower((unsigned char) pat[0]);
2895  } else if (islower((unsigned char) pat[0])) {
2896  pat += (char) toupper((unsigned char) pat[0]);
2897  }
2898 
2899  if (where == eFirst) {
2900  SIZE_TYPE pos = str.find_first_of(pat, start);
2901  while (pos != NPOS && (pos + l) <= end
2902  && CompareNocase(str, pos, l, pattern) != 0) {
2903  pos = str.find_first_of(pat, pos + 1);
2904  }
2905  return pos > end ? NPOS : pos;
2906 
2907  } else { // eLast
2908  SIZE_TYPE pos = str.find_last_of(pat, end);
2909  while (pos != NPOS && pos >= start
2910  && CompareNocase(str, pos, l, pattern) != 0) {
2911  if (pos == 0) {
2912  return NPOS;
2913  }
2914  pos = str.find_last_of(pat, pos - 1);
2915  }
2916  return pos < start ? NPOS : pos;
2917  }
2918 }
2919 
2920 
2921 const string* NStr::Find(const list <string>& lst, const CTempString val,
2922  ECase use_case)
2923 {
2924  if (lst.empty()) return NULL;
2925  ITERATE (list<string>, st_itr, lst) {
2926  if (Equal(*st_itr, val, use_case)) {
2927  return &*st_itr;
2928  }
2929  }
2930  return NULL;
2931 }
2932 
2933 const string* NStr::Find(const vector <string>& vec, const CTempString val,
2934  ECase use_case)
2935 {
2936  if (vec.empty()) return NULL;
2937  ITERATE (vector<string>, st_itr, vec) {
2938  if (Equal(*st_itr, val, use_case)) {
2939  return &*st_itr;
2940  }
2941  }
2942  return NULL;
2943 }
2944 
2945 
2946 /// @internal
2947 // Check whether symbol 'ch' is a word boundary character (i.e. it does not match [a-zA-Z0-9_]).
2948 static inline
2950 {
2951  return !(ch == '_' || isalnum((unsigned char)ch));
2952 }
2953 
2954 
2956  const CTempString word,
2957  ECase use_case,
2958  EDirection direction)
2959 {
2960  const SIZE_TYPE slen = str.length();
2961  const SIZE_TYPE plen = word.length();
2962 
2963  SIZE_TYPE start = 0;
2964  SIZE_TYPE end = slen;
2965 
2966  SIZE_TYPE pos = Find(str, word, use_case, direction);
2967 
2968  while (pos != NPOS) {
2969  // Check word boundaries
2970  if ( ((pos == 0) || s_IsWordBoundaryChar(str[pos-1])) &&
2971  ((pos + plen == slen) || s_IsWordBoundaryChar(str[pos+plen])) ) {
2972  return pos;
2973  }
2974  // Find next occurrence
2975  if (direction == eForwardSearch) {
2976  if (pos + plen == slen) {
2977  return NPOS;
2978  }
2979  ++start;
2980  } else {
2981  if (pos == 0) {
2982  return NPOS;
2983  }
2984  --end;
2985  }
2986  pos = Find(CTempString(str, start, end - start), word, use_case, direction);
2987  if (pos != NPOS) {
2988  // update position: from start of the string "str"
2989  pos += start;
2990  }
2991  }
2992  return pos;
2993 }
2994 
2995 
2997 {
2998  const SIZE_TYPE len1 = s1.length();
2999  const SIZE_TYPE len2 = s2.length();
3000 
3001  // Eliminate the null case
3002  if (len1 == 0 || len2 == 0) {
3003  return 0;
3004  }
3005  SIZE_TYPE len = min(len1, len2);
3006 
3007  // Truncate the longer string
3008  CTempString t1, t2;
3009  if (len1 > len2) {
3010  t1 = s1.substr(len1-len, len);
3011  t2 = s2;
3012  } else {
3013  t1 = s1;
3014  t2 = s2.substr(0, len);
3015  }
3016  // Quick check for the worst case
3017  if (memcmp(t1.data(), t2.data(), len) == 0) {
3018  return len;
3019  }
3020 
3021  // Start by looking for a single character match
3022  // and increase length until no match is found.
3023  // Performance analysis: http://neil.fraser.name/news/2010/11/04/
3024  SIZE_TYPE best = 0;
3025  SIZE_TYPE n = 1;
3026  for (;;) {
3027  // Right 'n' symbols of 't1'
3028  CTempString pattern(t1.data() + len - n, n);
3029  SIZE_TYPE pos = t2.find(pattern);
3030  if (pos == NPOS) {
3031  return best;
3032  }
3033  n += pos;
3034  if (pos == 0 || memcmp(pattern.data(), t2.data(), n) == 0) {
3035  best = n;
3036  n++;
3037  }
3038  }
3039  // Unreachable
3040  return best;
3041 }
3042 
3043 
3044 template <class TStr>
3045 TStr s_TruncateSpaces(const TStr& str, NStr::ETrunc where,
3046  const TStr& empty_str)
3047 {
3048  SIZE_TYPE length = str.length();
3049  if (length == 0) {
3050  return empty_str;
3051  }
3052  SIZE_TYPE beg = 0;
3053  if (where == NStr::eTrunc_Begin || where == NStr::eTrunc_Both) {
3054  _ASSERT(beg < length);
3055  while ( isspace((unsigned char) str[beg]) ) {
3056  if (++beg == length) {
3057  return empty_str;
3058  }
3059  }
3060  }
3061  SIZE_TYPE end = length;
3062  if ( where == NStr::eTrunc_End || where == NStr::eTrunc_Both ) {
3063  _ASSERT(beg < end);
3064  while (isspace((unsigned char) str[--end])) {
3065  if (beg == end) {
3066  return empty_str;
3067  }
3068  }
3069  _ASSERT(beg <= end && !isspace((unsigned char) str[end]));
3070  ++end;
3071  }
3072  _ASSERT(beg < end && end <= length);
3073  if ( beg | (end - length) ) { // if either beg != 0 or end != length
3074  return str.substr(beg, end - beg);
3075  }
3076  else {
3077  return str;
3078  }
3079 }
3080 
// Return a copy of 'str' with whitespace removed from the side(s) selected
// by 'where'. Delegates to s_TruncateSpaces(); kEmptyStr is passed as the
// value to return when nothing remains.
3081 string NStr::TruncateSpaces(const string& str, ETrunc where)
3082 {
3083  return s_TruncateSpaces(str, where, kEmptyStr);
3084 }
3085 
3087 {
3088  return s_TruncateSpaces(str, where, CTempString());
3089 }
3090 
3092 {
3093  str = s_TruncateSpaces(str, where, CTempString());
3094 }
3095 
3096 void NStr::TruncateSpacesInPlace(string& str, ETrunc where)
3097 {
3098  SIZE_TYPE length = str.length();
3099  if (length == 0) {
3100  return;
3101  }
3102  SIZE_TYPE beg = 0;
3103  if ( where == eTrunc_Begin || where == eTrunc_Both ) {
3104  // It's better to use str.data()[] to check string characters
3105  // to avoid implicit modification of the string by non-const operator[]
3106  _ASSERT(beg < length);
3107  while ( isspace((unsigned char) str.data()[beg]) ) {
3108  if (++beg == length) {
3109  str.erase();
3110  return;
3111  }
3112  }
3113  }
3114 
3115  SIZE_TYPE end = length;
3116  if ( where == eTrunc_End || where == eTrunc_Both ) {
3117  // It's better to use str.data()[] to check string characters
3118  // to avoid implicit modification of the string by non-const operator[]
3119  _ASSERT(beg < end);
3120  while (isspace((unsigned char) str.data()[--end])) {
3121  if (beg == end) {
3122  str.erase();
3123  return;
3124  }
3125  }
3126  _ASSERT(beg <= end && !isspace((unsigned char) str.data()[end]));
3127  ++end;
3128  }
3129  _ASSERT(beg < end && end <= length);
3130 
3131 #if defined(NCBI_COMPILER_GCC) && (NCBI_COMPILER_VERSION == 304)
3132  // work around a library bug
3133  str.replace(end, length, kEmptyStr);
3134  str.replace(0, beg, kEmptyStr);
3135 #else
3136  if ( beg | (end - length) ) { // if either beg != 0 or end != length
3137  str.replace(0, length, str, beg, end - beg);
3138  }
3139 #endif
3140 }
3141 
3142 
3144  ECase use_case)
3145 {
3146  if (!str.length() ||
3147  !prefix.length() ||
3148  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3149  return;
3150  }
3151  str.erase(0, prefix.length());
3152 }
3153 
3154 
3156  ECase use_case)
3157 {
3158  if (!str.length() ||
3159  !prefix.length() ||
3160  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3161  return;
3162  }
3163  str.assign(str.data() + prefix.length(), str.length() - prefix.length());
3164 }
3165 
3166 
3168  ECase use_case)
3169 {
3170  if (!str.length() ||
3171  !prefix.length() ||
3172  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3173  return str;
3174  }
3175  return CTempString(str.data() + prefix.length(), str.length() - prefix.length());
3176 }
3177 
3178 
3180  ECase use_case)
3181 {
3182  if (!str.length() ||
3183  !suffix.length() ||
3184  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3185  return;
3186  }
3187  str.erase(str.length() - suffix.length());
3188 }
3189 
3190 
3192  ECase use_case)
3193 {
3194  if (!str.length() ||
3195  !suffix.length() ||
3196  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3197  return;
3198  }
3199  str.erase(str.length() - suffix.length());
3200 }
3201 
3202 
3204  ECase use_case)
3205 {
3206  if (!str.length() ||
3207  !suffix.length() ||
3208  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3209  return str;
3210  }
3211  return CTempString(str.data(), str.length() - suffix.length());
3212 }
3213 
3214 
3215 string& NStr::Replace(const string& src,
3216  const string& search, const string& replace,
3217  string& dst, SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3218  SIZE_TYPE* num_replace)
3219 {
3220  // source and destination should not be the same
3221  if (&src == &dst) {
3222  NCBI_THROW2(CStringException, eBadArgs,
3223  "NStr::Replace(): source and destination are the same",0);
3224  }
3225  if (num_replace)
3226  *num_replace = 0;
3227  if (start_pos + search.size() > src.size() || search == replace) {
3228  dst = src;
3229  return dst;
3230  }
3231 
3232  // Use different algorithms depending on size or 'search' and 'replace'
3233  // for better performance (and for big strings only! > 16KB).
3234 
3235  if (replace.size() > search.size() && src.size() > 16*1024) {
3236  // Replacing string is longer -- worst case.
3237  // Try to avoid memory reallocations inside std::string.
3238  // Count replacing strings first
3239  SIZE_TYPE n = 0;
3240  SIZE_TYPE start_orig = start_pos;
3241  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3242  start_pos = src.find(search, start_pos);
3243  if (start_pos == NPOS)
3244  break;
3245  n++;
3246  start_pos += search.size();
3247  }
3248  // Reallocate memory for destination string
3249  dst.resize(src.size() - n*search.size() + n*replace.size());
3250 
3251  // Use copy() to create destination string
3252  start_pos = start_orig;
3253  string::const_iterator src_start = src.begin();
3254  string::const_iterator src_end = src.begin();
3255  string::iterator dst_pos = dst.begin();
3256 
3257  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3258  start_pos = src.find(search, start_pos);
3259  if (start_pos == NPOS)
3260  break;
3261  // Copy from source string up to 'search'
3262  src_end = src.begin() + start_pos;
3263  copy(src_start, src_end, dst_pos);
3264  dst_pos += (src_end - src_start);
3265  // Append 'replace'
3266  copy(replace.begin(), replace.end(), dst_pos);
3267  dst_pos += replace.size();
3268  start_pos += search.size();
3269  src_start = src.begin() + start_pos;
3270  }
3271  // Copy source's string tail to the place
3272  copy(src_start, src.end(), dst_pos);
3273  if (num_replace)
3274  *num_replace = n;
3275 
3276  } else {
3277  // Replacing string is shorter or have the same length.
3278  // ReplaceInPlace() can be faster on some platform, but not much,
3279  // so we use regular algorithm even for equal lengths here.
3280  dst = src;
3281  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3282  start_pos = dst.find(search, start_pos);
3283  if (start_pos == NPOS)
3284  break;
3285  dst.replace(start_pos, search.size(), replace);
3286  start_pos += replace.size();
3287  if (num_replace)
3288  (*num_replace)++;
3289  }
3290  }
3291  return dst;
3292 }
3293 
3294 
// Convenience overload: performs the replacement into a local string via the
// Replace() overload that takes a destination, and returns that string by
// value. All arguments are forwarded unchanged.
3295 string NStr::Replace(const string& src,
3296  const string& search, const string& replace,
3297  SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3298  SIZE_TYPE* num_replace)
3299 {
3300  string dst;
3301  Replace(src, search, replace, dst, start_pos, max_replace, num_replace);
3302  return dst;
3303 }
3304 
3305 
3306 string& NStr::ReplaceInPlace(string& src,
3307  const string& search, const string& replace,
3308  SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3309  SIZE_TYPE* num_replace)
3310 {
3311  if ( num_replace )
3312  *num_replace = 0;
3313  if ( start_pos + search.size() > src.size() || search == replace )
3314  return src;
3315 
3316  bool equal_len = (search.size() == replace.size());
3317  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3318  start_pos = src.find(search, start_pos);
3319  if (start_pos == NPOS)
3320  break;
3321  // On some platforms string's replace() implementation
3322  // is not optimal if size of search and replace strings are equal
3323  if ( equal_len ) {
3324  copy(replace.begin(), replace.end(), src.begin() + start_pos);
3325  } else {
3326  src.replace(start_pos, search.size(), replace);
3327  }
3328  start_pos += replace.size();
3329  if (num_replace)
3330  (*num_replace)++;
3331  }
3332  return src;
3333 }
3334 
3335 
3336 template<typename TString, typename TContainer>
3337 TContainer& s_Split(const TString& str, const TString& delim,
3338  TContainer& arr, NStr::TSplitFlags flags,
3339  vector<SIZE_TYPE>* token_pos,
3340  CTempString_Storage* storage = NULL)
3341 {
3342  typedef CStrTokenPosAdapter<vector<SIZE_TYPE> > TPosArray;
3344  typedef CStrTokenize<TString, TContainer, TPosArray,
3345  CStrDummyTokenCount, TReserve> TSplitter;
3346 
3347  TPosArray token_pos_proxy(token_pos);
3348  TSplitter splitter(str, delim, flags, storage);
3349  splitter.Do(arr, token_pos_proxy, kEmptyStr);
3350  return arr;
3351 }
3352 
// Argument check for split functions: expects variables named 'flags' and
// 'storage' in the expanding scope. Throws CStringException (eBadArgs) when
// fSplit_CanEscape or fSplit_CanQuote is requested but 'storage' is NULL.
// 'where' is the function name stringized into the error message.
3353 #define CHECK_SPLIT_TEMPSTRING_FLAGS(where) \
3354  { \
3355  if ((flags & (NStr::fSplit_CanEscape | NStr::fSplit_CanQuote)) && !storage) { \
3356  NCBI_THROW2(CStringException, eBadArgs, \
3357  "NStr::" #where "(): the selected flags require non-NULL storage", 0); \
3358  } \
3359 }
3360 
3361 
// Split 'str' into a list of strings. Forwards all arguments to s_Split()
// (no storage argument is passed, so its default is used) and returns 'arr'.
3362 list<string>& NStr::Split(const CTempString str, const CTempString delim,
3363  list<string>& arr, TSplitFlags flags,
3364  vector<SIZE_TYPE>* token_pos)
3365 {
3366  return s_Split(str, delim, arr, flags, token_pos);
3367 }
3368 
// Split 'str' into a vector of strings. Forwards all arguments to s_Split()
// (no storage argument is passed, so its default is used) and returns 'arr'.
3369 vector<string>& NStr::Split(const CTempString str, const CTempString delim,
3370  vector<string>& arr, TSplitFlags flags,
3371  vector<SIZE_TYPE>* token_pos)
3372 {
3373  return s_Split(str, delim, arr, flags, token_pos);
3374 }
3375 
3376 list<CTempString>& NStr::Split(const CTempString str, const CTempString delim,
3377  list<CTempString>& arr, TSplitFlags flags,
3378  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3379 {
3381  return s_Split(str, delim, arr, flags, token_pos, storage);
3382 }
3383 
3384 vector<CTempString>& NStr::Split(const CTempString str, const CTempString delim,
3385  vector<CTempString>& arr, TSplitFlags flags,
3386  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3387 {
3389  return s_Split(str, delim, arr, flags, token_pos, storage);
3390 }
3391 
3392 list<CTempStringEx>& NStr::Split(const CTempString str, const CTempString delim,
3393  list<CTempStringEx>& arr, TSplitFlags flags,
3394  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3395 {
3397  return s_Split(str, delim, arr, flags, token_pos, storage);
3398 }
3399 
3400 vector<CTempStringEx>& NStr::Split(const CTempString str, const CTempString delim,
3401  vector<CTempStringEx>& arr, TSplitFlags flags,
3402  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3403 {
3405  return s_Split(str, delim, arr, flags, token_pos, storage);
3406 }
3407 
// Same as the list<string> Split() call, except that fSplit_ByPattern is
// OR-ed into 'flags' before delegating to s_Split(). Returns 'arr'.
3408 list<string>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3409  list<string>& arr, TSplitFlags flags,
3410  vector<SIZE_TYPE>* token_pos)
3411 {
3412  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos);
3413 }
3414 
// Same as the vector<string> Split() call, except that fSplit_ByPattern is
// OR-ed into 'flags' before delegating to s_Split(). Returns 'arr'.
3415 vector<string>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3416  vector<string>& arr, TSplitFlags flags,
3417  vector<SIZE_TYPE>* token_pos)
3418 {
3419  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos);
3420 }
3421 
3422 list<CTempString>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3423  list<CTempString>& arr, TSplitFlags flags,
3424  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3425 {
3427  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3428 }
3429 
3430 vector<CTempString>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3431  vector<CTempString>& arr, TSplitFlags flags,
3432  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3433 {
3435  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3436 }
3437 
3438 list<CTempStringEx>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3439  list<CTempStringEx>& arr, TSplitFlags flags,
3440  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3441 {
3443  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3444 }
3445 
3446 vector<CTempStringEx>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3447  vector<CTempStringEx>& arr, TSplitFlags flags,
3448  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3449 {
3451  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3452 }
3453 
3454 
3455 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3456  string& str1, string& str2, TSplitFlags flags)
3457 {
3458  CTempStringEx ts1, ts2;
3459  CTempString_Storage storage;
3460  bool result = SplitInTwo(str, delim, ts1, ts2, flags, &storage);
3461  str1 = ts1;
3462  str2 = ts2;
3463  return result;
3464 }
3465 
3466 
3467 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3468  CTempString& str1, CTempString& str2, TSplitFlags flags,
3469  CTempString_Storage* storage)
3470 {
3471  CTempStringEx ts1, ts2;
3472  bool result = SplitInTwo(str, delim, ts1, ts2, flags, storage);
3473  str1 = ts1;
3474  str2 = ts2;
3475  return result;
3476 }
3477 
3478 
3479 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3480  CTempStringEx& str1, CTempStringEx& str2,
3482 {
3487 
3488  CTempStringList part_collector(storage);
3489  TSplitter splitter(str, delim, flags, storage);
3490  SIZE_TYPE delim_pos = NPOS;
3491 
3492  // get first part
3493  splitter.Advance(&part_collector, &delim_pos);
3494  part_collector.Join(&str1);
3495  part_collector.Clear();
3496 
3497  // don't need further splitting, just quote and escape parsing
3498  splitter.SetDelim(kEmptyStr);
3499  splitter.Advance(&part_collector);
3500  part_collector.Join(&str2);
3501 
3502  return delim_pos != NPOS;
3503 }
3504 
3505 
3506 template <typename T>
3507 string s_NStr_Join(const T& arr, const CTempString delim)
3508 {
3509  if (arr.empty()) {
3510  return kEmptyStr;
3511  }
3512  typename T::const_iterator it = arr.begin();
3513  string result = *it;
3514  SIZE_TYPE needed = result.size();
3515 
3516  while (++it != arr.end()) {
3517  needed += delim.size() + it->size();
3518  }
3519  result.reserve(needed);
3520  it = arr.begin();
3521  while (++it != arr.end()) {
3522  result += delim;
3523  result += *it;
3524  }
3525  return result;
3526 }
3527 
3528 
// Join the strings of a list, separated by 'delim' (see s_NStr_Join();
// an empty list yields kEmptyStr).
3529 string NStr::Join(const list<string>& arr, const CTempString delim)
3530 {
3531  return s_NStr_Join(arr, delim);
3532 }
3533 
3534 
// Join the elements of a list<CTempString>, separated by `delim`.
// Forwards to the s_NStr_Join() template.
3535 string NStr::Join(const list<CTempString>& arr, const CTempString delim)
3536 {
3537  return s_NStr_Join(arr, delim);
3538 }
3539 
3540 
// Join the elements of a vector<string>, separated by `delim`.
// Forwards to the s_NStr_Join() template.
3541 string NStr::Join(const vector<string>& arr, const CTempString delim)
3542 {
3543  return s_NStr_Join(arr, delim);
3544 }
3545 
3546 
// Join the elements of a vector<CTempString>, separated by `delim`.
// Forwards to the s_NStr_Join() template.
3547 string NStr::Join(const vector<CTempString>& arr, const CTempString delim)
3548 {
3549  return s_NStr_Join(arr, delim);
3550 }
3551 
3552 
// Join the elements of a set<string>, separated by `delim`; elements are
// visited in the set's iteration order.
// Forwards to the s_NStr_Join() template.
3553 string NStr::Join(const set<string>& arr, const CTempString delim)
3554 {
3555  return s_NStr_Join(arr, delim);
3556 }
3557 
3558 
// Join the elements of a set<CTempString>, separated by `delim`; elements are
// visited in the set's iteration order.
// Forwards to the s_NStr_Join() template.
3559 string NStr::Join(const set<CTempString>& arr, const CTempString delim)
3560 {
3561  return s_NStr_Join(arr, delim);
3562 }
3563 
3564 
3565 // Auxiliary macros for NStr::Sanitize()
3566 
// SS_WRITE_SUBSTR: flush the pending run of accepted characters.
// Creates the output stream `out` on first use, writes `n_good` bytes of
// `str` starting at offset `pos`, then resets `n_good` to 0 and moves `pos`
// to the current index `i`.
// Expands to several statements (not wrapped in a single block) and relies on
// the locals `out`, `str`, `pos`, `n_good` and `i` of the expanding function.
// The last statement has no trailing semicolon; the user supplies it.
3567 #define SS_WRITE_SUBSTR \
3568  if (!out.get()) { \
3569  out.reset(new CNcbiOstrstream); \
3570  } \
3571  out->write(str.data() + pos, n_good); \
3572  n_good = 0; \
3573  pos = i
3574 
// SS_WRITE_SPACES: emit the pending spaces.
// Creates the output stream `out` on first use, then writes `n_spaces` blanks
// when fSS_NoMerge is set in `flags`, or a single blank otherwise.
// It does not reset `n_spaces`; the expanding code is responsible for that.
// Expands to several statements and declares a local named `n`, so it can be
// used at most once per scope; relies on the locals `out`, `flags` and
// `n_spaces` of the expanding function.
3575 #define SS_WRITE_SPACES \
3576  if (!out.get()) { \
3577  out.reset(new CNcbiOstrstream); \
3578  } \
3579  SIZE_TYPE n = (flags & fSS_NoMerge) ? n_spaces : 1; \
3580  for (SIZE_TYPE j = n; j > 0; j--) { \
3581  out->put(' '); \
3582  }
3583 
// Body of the sanitizing routine that the SS_WRITE_* macros above serve
// (NStr::Sanitize(), according to the macro comment).
//
// NOTE(review): the signature line is absent from this listing; the body
// reads a string-like `str` and a modifiable flag set `flags`. Several
// single-statement lines inside `if` bodies are absent as well (each gap is
// marked below) — confirm all of them against the original file.
//
// Visible logic: `str` is scanned once while tracking the current run of
// accepted characters (`pos`, `n_good`) and the number of pending spaces
// (`n_spaces`). A character is accepted when it matches one of the selected
// fSS_* character classes; with fSS_Reject the test is inverted. Rejected
// characters are counted as pending spaces unless fSS_Remove is set.
// The output stream `out` is only created once something has to be written,
// so inputs that need no change are returned without going through a stream.
3585 {
3586  unique_ptr<CNcbiOstrstream> out;
3587  SIZE_TYPE i;
3588  SIZE_TYPE pos = 0; // start position of the substring
3589  SIZE_TYPE n_good = 0; // length of substring (good symbols)
3590  SIZE_TYPE n_spaces = 0; // number of accumulated spaces
3591 
3592  // Use fSS_print by default if no any other filter
3593  if ( !(flags & (fSS_alpha | fSS_digit | fSS_alnum |
3594  fSS_print | fSS_cntrl | fSS_punct)) ) {
3595  flags |= fSS_print;
3596  }
3597 
3598  for (i = 0; i < str.size(); i++)
3599  {
3600  char c = str[i];
3601 
3602  // Check on bare space.
3603  // Do not use ::isspace() here due it interference with ::iscntrl().
3604  if ( c == ' ' ) {
3605  // Spaces starts and we have good chars before? -- write it immediately
3606  if ( n_good ) {
// NOTE(review): statement absent from this listing (presumably SS_WRITE_SUBSTR) — confirm.
3608  }
3609  // Count and skip all consecutive spaces
3610  ++n_spaces;
3611  continue;
3612  }
3613 
3614  // Check against filters
// Start from the "no filter matched" verdict: accepted only in reject mode.
3615  bool allowed = ((flags & fSS_Reject) != 0);
3616  if (((flags & fSS_print) && isprint((unsigned char)c)) ||
3617  ((flags & fSS_alnum) && isalnum((unsigned char)c)) ||
3618  ((flags & fSS_alpha) && isalpha((unsigned char)c)) ||
3619  ((flags & fSS_digit) && isdigit((unsigned char)c)) ||
3620  ((flags & fSS_cntrl) && iscntrl((unsigned char)c)) ||
3621  ((flags & fSS_punct) && ispunct((unsigned char)c)) ) {
3622 
3623  // If matched and reverse logic -- treat char as rejected
3624  allowed = ((flags & fSS_Reject) == 0);
3625  }
3626 
3627  // Good character and no spaces before?
3628  if ( allowed && !n_spaces ) {
3629  // Continue to build substring or start new one
3630  if ( n_good ) {
3631  ++n_good;
3632  } else {
3633  n_good = 1;
3634  pos = i;
3635  }
3636  continue;
3637  }
3638 
3639  // Rejected character?
3640  if ( !allowed ) {
3641  // Write previously accumulated substring
3642  if ( n_good ) {
// NOTE(review): statement absent from this listing (presumably SS_WRITE_SUBSTR) — confirm.
3644  }
3645  // Increase space pool or just ignore
3646  if ( !(flags & fSS_Remove) ) {
3647  ++n_spaces;
3648  }
3649  continue;
3650  }
3651 
// Only an accepted character that follows pending spaces reaches this point.
3652  _ASSERT( allowed );
3653  _ASSERT( !n_good );
3654 
3655  // Otherwise need to process accumulated spaces first
3656  if ( n_spaces ) {
3657  // Don't write leading spaces
3658  if ( pos || (!pos && (flags & fSS_NoTruncate_Begin))) {
// NOTE(review): statement absent from this listing (presumably SS_WRITE_SPACES) — confirm.
3660  }
3661  n_spaces = 0;
3662  }
3663  // Start new substring
3664  n_good = 1;
3665  pos = i;
3666 
3667  } /* for */
3668 
3669  // Some good characters?
3670  if ( n_good ) {
3671  if (i == n_good) {
3672  // All are good - return (a copy of) the original string
3673  return str;
3674  }
3675  if ( !out.get() ) {
3676  // All leading characters are bad - return a second part of
3677  // the original string, to avoid copying (below).
3678  return str.substr(pos, n_good);
3679  }
3680  // Write last accumulated substring
// NOTE(review): the write statement itself is absent from this listing — confirm.
3682  }
3683 
3684  // Empty string, or all spaces (or rejected chars replaced with spaces)?
3685  if ( (i == n_spaces) || (n_spaces && !out.get()) ) {
// Spaces-only results are kept only when both truncation-suppressing bits
// of fSS_NoTruncate are set.
3686  if (!n_spaces || ((flags & fSS_NoTruncate) != fSS_NoTruncate) ) {
3687  return kEmptyStr;
3688  }
3689  if (flags & fSS_NoMerge) {
// NOTE(review): statement absent from this listing (presumably SS_WRITE_SPACES,
// which would also create `out` before it is dereferenced below) — confirm.
3691  return CNcbiOstrstreamToString(*out);
3692  } else {
3693  return " ";
3694  }
3695  return kEmptyStr;
3696  }
3697 
3698  // Trailing spaces?
3699  if ( n_spaces ) {
3700  _ASSERT(out.get());
3701  if (flags & fSS_NoTruncate_End) {
// NOTE(review): statement absent from this listing (presumably SS_WRITE_SPACES) — confirm.
3703  }
3704  }
3705  if (out.get()) {
3706  // Return sanitized string
3707  return CNcbiOstrstreamToString(*out);
3708  }
3709 
3710  // All rejected
3711  return kEmptyStr;
3712 }
3713 
3714 
// NOTE(review): this closes a declaration whose opening lines are absent from
// this listing — presumably the ELanguage enumeration whose values
// eLanguage_C and eLanguage_Javascript are used by s_PrintableString() below;
// confirm against the original file.
3718 };
3719 
3720 
// Return a copy of `str` in which characters that need quoting are written as
// backslash escape sequences.
//
//  - \t \v \b \r \f \a become a backslash followed by the usual letter;
//  - '\n' becomes "\n", unless NStr::fNewLine_Passthru is set in `mode`, in
//    which case "\n" is followed by a backslash and a real newline;
//  - backslash, single quote and double quote get a backslash prefix;
//  - '&' gets a backslash prefix only when `lang` is eLanguage_Javascript;
//  - other non-printable characters (and, with NStr::fNonAscii_Quote, all
//    non-ASCII characters) are written in octal: three digits when
//    NStr::fPrintable_Full is set or when the next input character is an
//    octal digit, otherwise with leading zeros dropped.
//
// If nothing had to be escaped, the input is returned as is and no output
// stream is created.
//
// NOTE(review): one line of the signature is absent from this listing; it
// presumably declares the `mode` parameter used in the body — confirm.
3721 static string s_PrintableString(const CTempString str,
3723  ELanguage lang)
3724 {
3725  unique_ptr<CNcbiOstrstream> out;
// i - current input position; j - start of the not-yet-copied literal run
3726  SIZE_TYPE i, j = 0;
3727 
3728  for (i = 0; i < str.size(); i++) {
3729  bool octal = false;
3730  char c = str[i];
// The switch either `continue`s (character is copied later as part of a
// literal run) or `break`s with `c` / `octal` describing the escape to emit.
3731  switch (c) {
3732  case '\t':
3733  c = 't';
3734  break;
3735  case '\v':
3736  c = 'v';
3737  break;
3738  case '\b':
3739  c = 'b';
3740  break;
3741  case '\r':
3742  c = 'r';
3743  break;
3744  case '\f':
3745  c = 'f';
3746  break;
3747  case '\a':
3748  c = 'a';
3749  break;
3750  case '\n':
// In pass-through mode `c` stays '\n' and is special-cased when written below.
3751  if (!(mode & NStr::fNewLine_Passthru))
3752  c = 'n';
3753  /*FALLTHRU*/
3754  case '\\':
3755  case '\'':
3756  case '"':
3757  break;
3758  case '&':
3759  if (lang != eLanguage_Javascript)
3760  continue;
3761  break;
3762  default:
3763  if (!isascii((unsigned char) c)) {
3764  if (mode & NStr::fNonAscii_Quote) {
3765  octal = true;
3766  break;
3767  }
3768  }
3769  if (!isprint((unsigned char) c)) {
3770  octal = true;
3771  break;
3772  }
3773  continue;
3774  }
3775  if (!out.get()) {
3776  out.reset(new CNcbiOstrstream);
3777  }
// Flush the literal characters accumulated since the previous escape
3778  if (i > j) {
3779  out->write(str.data() + j, i - j);
3780  }
3781  out->put('\\');
3782  if (c == '\n') {
3783  out->write("n\\\n", 3);
3784  } else if (octal) {
// `reduce` - leading zero digits may be dropped, which is only safe when
// the next input character cannot be read as one more octal digit.
3785  bool reduce;
3786  if (!(mode & NStr::fPrintable_Full)) {
3787  reduce = (i == str.size() - 1 ||
3788  str[i + 1] < '0' || str[i + 1] > '7' ? true : false);
3789  } else {
3790  reduce = false;
3791  }
3792  unsigned char v;
3793  char val[3];
3794  int k = 0;
3795  v = (unsigned char)((unsigned char)c >> 6);
3796  if (v || !reduce) {
3797  val[k++] = char('0' + v);
3798  reduce = false;
3799  }
3800  v = ((unsigned char)c >> 3) & 7;
3801  if (v || !reduce) {
3802  val[k++] = char('0' + v);
3803  }
3804  v = (unsigned char)c & 7;
3805  val[k++] = char('0' + v);
3806  out->write(val, k);
3807  } else {
3808  out->put(c);
3809  }
3810  j = i + 1;
3811  }
// Copy the literal tail that follows the last escape, if any
3812  if (j && i > j) {
3813  _ASSERT(out.get());
3814  out->write(str.data() + j, i - j);
3815  }
3816  if (out.get()) {
3817  // Return encoded string
3818  return CNcbiOstrstreamToString(*out);
3819  }
3820 
3821  // All characters are good - return (a copy of) the original string
3822  return str;
3823 }
3824 
3825 
// Forwards `str` and `mode` to s_PrintableString() with the C language
// variant (eLanguage_C).
// NOTE(review): the signature line(s) of this function are absent from this
// listing — presumably NStr::PrintableString(); confirm.
3828 {
3829  return s_PrintableString(str, mode, eLanguage_C);
3830 }
3831 
3832 
// Another wrapper around s_PrintableString().
// NOTE(review): the signature line and the remaining call arguments are
// absent from this listing — presumably this is the JavaScript variant
// (eLanguage_Javascript); confirm against the original file.
3834 {
3835  return s_PrintableString(str,
3838 }
3839 
3840 
3841 string NStr::CEncode(const CTempString str, EQuoted quoted)
3842 {
3843  switch (quoted) {
3844  case eNotQuoted:
3845  return PrintableString(str);
3846  case eQuoted:
3847  return '"' + PrintableString(str) + '"';
3848  }
3849  _TROUBLE;
3850  // Unreachable
3851  return str;
3852 }
3853 
3854 
3855 string NStr::CParse(const CTempString str, EQuoted quoted)
3856 {
3857  if (quoted == eNotQuoted) {
3858  return ParseEscapes(str);
3859  }
3860  _ASSERT(quoted == eQuoted);
3861 
3862  SIZE_TYPE pos;
3863  SIZE_TYPE len = str.length();
3864  const char quote_char = '"';
3865 
3866  if (len < 2 || str[0] != quote_char || str[len-1] != quote_char) {
3867  NCBI_THROW2(CStringException, eFormat,
3868  "The source string must start and finish with a double quote", 0);
3869  }
3870 
3871  // Flag that next char is escaped, ignore it
3872  bool escaped = false;
3873  // We have a quote mark, start collect string chars
3874  bool collect = true;
3875  // Position of last quote
3876  SIZE_TYPE last_quote = 0;
3877 
3878  string out;
3879  out.reserve(str.size());
3880 
3881  for (pos = 1; pos < len; ++pos) {
3882  unsigned char ch = str[pos];
3883  if (ch == quote_char && !escaped) {
3884  // Have a substring
3885  CTempString sub(str.data() + last_quote + 1, pos - last_quote - 1);
3886  if (collect) {
3887  // Parse escape sequences and add it to result
3888  out += ParseEscapes(sub);
3889  } else {
3890  // Possible we have adjacent strings ("A""B").
3891  if (pos != last_quote + 1) {
3892  NCBI_THROW2(CStringException, eFormat,
3893  "Quoted string format error", pos);
3894  }
3895  }
3896  last_quote = pos;
3897  collect = !collect;
3898  } else {
3899  escaped = ch == '\\' ? !escaped : false;
3900  }
3901  }
3902  if (escaped || last_quote != len-1) {
3903  NCBI_THROW2(CStringException, eFormat,
3904  "Unterminated quoted string", str.length());
3905  }
3906  return out;
3907 }
3908 
3909 
// XML encoder body: replaces the five predefined XML entities and control
// characters below 0x20 with character references.
//
//  & < > ' "        -> &amp; &lt; &gt; &apos; &quot;
//  chars < 0x20     -> &#xH; or &#xHH; (lowercase hex, no leading zero)
//  '-'              -> when `flags` equals eXmlEnc_CommentSafe, a hyphen at
//                      the very end becomes &#x2d; and a pair "--" becomes
//                      "-&#x2d;"; otherwise it is copied unchanged
//  everything else  -> copied unchanged
//
// NOTE(review): the signature line is absent from this listing; the body
// reads `str` and `flags` — presumably NStr::XmlEncode(); confirm.
3911 // http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent
3912 {
3913  string result;
3914  SIZE_TYPE i;
3915 
3916  // wild guess...
3917  result.reserve(str.size());
3918 
3919  for (i = 0; i < str.size(); i++) {
3920  char c = str[i];
3921  switch ( c ) {
3922  case '&':
3923  result.append("&amp;");
3924  break;
3925  case '<':
3926  result.append("&lt;");
3927  break;
3928  case '>':
3929  result.append("&gt;");
3930  break;
3931  case '\'':
3932  result.append("&apos;");
3933  break;
3934  case '"':
3935  result.append("&quot;");
3936  break;
3937  case '-':
3938  if (flags == eXmlEnc_CommentSafe) {
3939 // translate double hyphen and ending hyphen
3940 // http://www.w3.org/TR/xml11/#sec-comments
3941  if (i+1 == str.size()) {
3942  result.append("&#x2d;");
3943  break;
3944  } else if (str[i+1] == '-') {
// Consume both hyphens at once: the first is kept, the second is encoded
3945  ++i;
3946  result.append(1, c).append("&#x2d;");
3947  break;
3948  }
3949  }
3950  result.append(1, c);
3951  break;
3952  default:
3953  if ((unsigned int)(c) < 0x20) {
3954  const char* charmap = "0123456789abcdef";
3955  result.append("&#x");
3956  Uint1 ch = c;
3957  unsigned hi = ch >> 4;
3958  unsigned lo = ch & 0xF;
3959  if ( hi ) {
3960  result.append(1, charmap[hi]);
3961  }
3962  result.append(1, charmap[lo]).append(1, ';');
3963  } else {
3964  result.append(1, c);
3965  }
3966  break;
3967  }
3968  }
3969  return result;
3970 }
3971 
3972 
// HTML encoder body: replaces & < > ' " with &amp; &lt; &gt; &apos; &quot;
// and control characters below 0x20 with &#xH; / &#xHH; references.
//
// With fHtmlEnc_SkipEntities bits set in `flags`, an '&' that already starts
// something shaped like an entity ("&name;" made of letters, or "&#digits;")
// is left as a bare '&' instead of being turned into "&amp;". With
// fHtmlEnc_CheckPreencoded an informational message is posted (once) when
// such a pre-encoded entity is found.
//
// NOTE(review): the signature line is absent from this listing; the body
// reads `str` and `flags` — presumably NStr::HtmlEncode(); confirm.
3974 {
3975  string result;
3976  SIZE_TYPE i;
// Position of the next ';' found so far; NPOS once none is left, which
// disables the entity check for the rest of the string.
3977  SIZE_TYPE semicolon = 0;
3978 
3979  // wild guess...
3980  result.reserve(str.size());
3981 
3982  for (i = 0; i < str.size(); i++) {
3983  char c = str[i];
3984  switch ( c ) {
3985  case '&':
3986  {{
// The '&' itself is always written; "amp;" is added below unless the
// text that follows is recognized as an existing entity.
3987  result.append("&");
3988  // Check on HTML entity
3989  bool is_entity = false;
3990  if ((flags & fHtmlEnc_SkipEntities) &&
3991  (i+2 < str.size()) && (semicolon != NPOS)) {
3992 
// Search for a ';' only when the cached one is already behind us
3993  if ( i >= semicolon ) {
3994  semicolon = str.find(";", i+1);
3995  }
3996  if ( semicolon != NPOS ) {
3997  SIZE_TYPE len = semicolon - i;
3998  SIZE_TYPE p = i + 1;
3999  if (str[i+1] == '#') {
4000  // Check on numeric character reference encoding
4001  if (flags & fHtmlEnc_SkipNumericEntities) {
4002  p++;
// NOTE(review): `len || len <= 4` is true for every value of len, so no
// length limit is applied here; the literal-entity branch below uses
// `len && len <= 10`. Confirm the intended operator and bound.
// Only decimal digits are accepted by the loop below.
4003  if (len || len <= 4) {
4004  for (; p < semicolon; ++p) {
4005  if (!isdigit((unsigned char)(str[p])))
4006  break;
4007  }
4008  }
4009  }
4010  } else {
4011  // Check on literal entity
4012  if (flags & fHtmlEnc_SkipLiteralEntities) {
4013  if (len && len <= 10) {
4014  for (; p < semicolon; ++p) {
4015  if (!isalpha((unsigned char)(str[p])))
4016  break;
4017  }
4018  }
4019  }
4020  }
// An entity is recognized when the scan reached the ';' without
// meeting a character of the wrong class.
4021  is_entity = (p == semicolon);
4022  }
4023  }
4024  if ( is_entity ) {
4025  if (flags & fHtmlEnc_CheckPreencoded) {
4026  ERR_POST_X_ONCE(5, Info << "string \"" << str <<
4027  "\" contains HTML encoded entities");
4028  }
4029  } else {
4030  result.append("amp;");
4031  }
4032  }}
4033  break;
4034  case '<':
4035  result.append("&lt;");
4036  break;
4037  case '>':
4038  result.append("&gt;");
4039  break;
4040  case '\'':
4041  result.append("&apos;");
4042  break;
4043  case '"':
4044  result.append("&quot;");
4045  break;
4046  default:
4047  if ((unsigned int)(c) < 0x20) {
4048  const char* charmap = "0123456789abcdef";
4049  result.append("&#x");
4050  Uint1 ch = c;
4051  unsigned hi = ch >> 4;
4052  unsigned lo = ch & 0xF;
4053  if ( hi ) {
4054  result.append(1, charmap[hi]);
4055  }
4056  result.append(1, charmap[lo]).append(1, ';');
4057  } else {
4058  result.append(1, c);
4059  }
4060  break;
4061  }
4062  }
4063  return result;
4064 }
4065 
4066 
4067 // Character entity references
4068 // http://www.w3.org/TR/html4/sgml/entities.html
4069 // http://www.w3.org/TR/1998/REC-html40-19980424/charset.html#h-5.3
4070 // only some entities from here were added (those shifted to right):
4071 // http://dev.w3.org/html5/html-author/charref
4072 
// Table that maps a character code to its HTML entity name (without the
// leading '&' and trailing ';'). It is scanned linearly in both directions:
// by code to find a name and by name to find a code. The scans stop at the
// first record whose code `u` is 0, so the { 0, 0 } record must stay last.
// NOTE(review): the line declaring the code member is absent from this
// listing — presumably `TUnicodeSymbol u;`, as the users of the table read
// `p->u`; confirm against the original file.
4073 static struct tag_HtmlEntities
4074 {
4076  const char* s;
4077 }
4078 const s_HtmlEntities[] = {
4079  { 9, "Tab" },
4080  { 10, "NewLine" },
4081  { 33, "excl" },
4082  { 34, "quot" },
4083  { 35, "num" },
4084  { 36, "dollar" },
4085  { 37, "percnt" },
4086  { 38, "amp" },
4087  { 39, "apos" },
4088  { 40, "lpar" },
4089  { 41, "rpar" },
4090  { 42, "ast" },
4091  { 43, "plus" },
4092  { 44, "comma" },
4093  { 46, "period" },
4094  { 47, "sol" },
4095  { 58, "colon" },
4096  { 59, "semi" },
4097  { 60, "lt" },
4098  { 61, "equals" },
4099  { 62, "gt" },
4100  { 63, "quest" },
4101  { 64, "commat" },
4102  { 91, "lsqb" },
4103  { 92, "bsol" },
4104  { 93, "rsqb" },
4105  { 94, "Hat" },
4106  { 95, "lowbar" },
4107  { 96, "grave" },
4108  { 123, "lcub" },
4109  { 124, "verbar" },
4110  { 125, "rcub" },
4111  { 160, "nbsp" },
4112  { 161, "iexcl" },
4113  { 162, "cent" },
4114  { 163, "pound" },
4115  { 164, "curren" },
4116  { 165, "yen" },
4117  { 166, "brvbar" },
4118  { 167, "sect" },
4119  { 168, "uml" },
4120  { 169, "copy" },
4121  { 170, "ordf" },
4122  { 171, "laquo" },
4123  { 172, "not" },
4124  { 173, "shy" },
4125  { 174, "reg" },
4126  { 175, "macr" },
4127  { 176, "deg" },
4128  { 177, "plusmn" },
4129  { 178, "sup2" },
4130  { 179, "sup3" },
4131  { 180, "acute" },
4132  { 181, "micro" },
4133  { 182, "para" },
4134  { 183, "middot" },
4135  { 184, "cedil" },
4136  { 185, "sup1" },
4137  { 186, "ordm" },
4138  { 187, "raquo" },
4139  { 188, "frac14" },
4140  { 189, "frac12" },
4141  { 190, "frac34" },
4142  { 191, "iquest" },
4143  { 192, "Agrave" },
4144  { 193, "Aacute" },
4145  { 194, "Acirc" },
4146  { 195, "Atilde" },
4147  { 196, "Auml" },
4148  { 197, "Aring" },
4149  { 198, "AElig" },
4150  { 199, "Ccedil" },
4151  { 200, "Egrave" },
4152  { 201, "Eacute" },
4153  { 202, "Ecirc" },
4154  { 203, "Euml" },
4155  { 204, "Igrave" },
4156  { 205, "Iacute" },
4157  { 206, "Icirc" },
4158  { 207, "Iuml" },
4159  { 208, "ETH" },
4160  { 209, "Ntilde" },
4161  { 210, "Ograve" },
4162  { 211, "Oacute" },
4163  { 212, "Ocirc" },
4164  { 213, "Otilde" },
4165  { 214, "Ouml" },
4166  { 215, "times" },
4167  { 216, "Oslash" },
4168  { 217, "Ugrave" },
4169  { 218, "Uacute" },
4170  { 219, "Ucirc" },
4171  { 220, "Uuml" },
4172  { 221, "Yacute" },
4173  { 222, "THORN" },
4174  { 223, "szlig" },
4175  { 224, "agrave" },
4176  { 225, "aacute" },
4177  { 226, "acirc" },
4178  { 227, "atilde" },
4179  { 228, "auml" },
4180  { 229, "aring" },
4181  { 230, "aelig" },
4182  { 231, "ccedil" },
4183  { 232, "egrave" },
4184  { 233, "eacute" },
4185  { 234, "ecirc" },
4186  { 235, "euml" },
4187  { 236, "igrave" },
4188  { 237, "iacute" },
4189  { 238, "icirc" },
4190  { 239, "iuml" },
4191  { 240, "eth" },
4192  { 241, "ntilde" },
4193  { 242, "ograve" },
4194  { 243, "oacute" },
4195  { 244, "ocirc" },
4196  { 245, "otilde" },
4197  { 246, "ouml" },
4198  { 247, "divide" },
4199  { 248, "oslash" },
4200  { 249, "ugrave" },
4201  { 250, "uacute" },
4202  { 251, "ucirc" },
4203  { 252, "uuml" },
4204  { 253, "yacute" },
4205  { 254, "thorn" },
4206  { 255, "yuml" },
4207  { 338, "OElig" },
4208  { 339, "oelig" },
4209  { 352, "Scaron" },
4210  { 353, "scaron" },
4211  { 376, "Yuml" },
4212  { 402, "fnof" },
4213  { 710, "circ" },
4214  { 732, "tilde" },
4215  { 913, "Alpha" },
4216  { 914, "Beta" },
4217  { 915, "Gamma" },
4218  { 916, "Delta" },
4219  { 917, "Epsilon" },
4220  { 918, "Zeta" },
4221  { 919, "Eta" },
4222  { 920, "Theta" },
4223  { 921, "Iota" },
4224  { 922, "Kappa" },
4225  { 923, "Lambda" },
4226  { 924, "Mu" },
4227  { 925, "Nu" },
4228  { 926, "Xi" },
4229  { 927, "Omicron" },
4230  { 928, "Pi" },
4231  { 929, "Rho" },
4232  { 931, "Sigma" },
4233  { 932, "Tau" },
4234  { 933, "Upsilon" },
4235  { 934, "Phi" },
4236  { 935, "Chi" },
4237  { 936, "Psi" },
4238  { 937, "Omega" },
4239  { 945, "alpha" },
4240  { 946, "beta" },
4241  { 947, "gamma" },
4242  { 948, "delta" },
4243  { 949, "epsilon" },
4244  { 950, "zeta" },
4245  { 951, "eta" },
4246  { 952, "theta" },
4247  { 953, "iota" },
4248  { 954, "kappa" },
4249  { 955, "lambda" },
4250  { 956, "mu" },
4251  { 957, "nu" },
4252  { 958, "xi" },
4253  { 959, "omicron" },
4254  { 960, "pi" },
4255  { 961, "rho" },
4256  { 962, "sigmaf" },
4257  { 963, "sigma" },
4258  { 964, "tau" },
4259  { 965, "upsilon" },
4260  { 966, "phi" },
4261  { 967, "chi" },
4262  { 968, "psi" },
4263  { 969, "omega" },
4264  { 977, "thetasym" },
4265  { 978, "upsih" },
4266  { 982, "piv" },
4267  { 8194, "ensp" },
4268  { 8195, "emsp" },
4269  { 8201, "thinsp" },
4270  { 8204, "zwnj" },
4271  { 8205, "zwj" },
4272  { 8206, "lrm" },
4273  { 8207, "rlm" },
4274  { 8211, "ndash" },
4275  { 8212, "mdash" },
4276  { 8216, "lsquo" },
4277  { 8217, "rsquo" },
4278  { 8218, "sbquo" },
4279  { 8220, "ldquo" },
4280  { 8221, "rdquo" },
4281  { 8222, "bdquo" },
4282  { 8224, "dagger" },
4283  { 8225, "Dagger" },
4284  { 8226, "bull" },
4285  { 8230, "hellip" },
4286  { 8240, "permil" },
4287  { 8242, "prime" },
4288  { 8243, "Prime" },
4289  { 8249, "lsaquo" },
4290  { 8250, "rsaquo" },
4291  { 8254, "oline" },
4292  { 8260, "frasl" },
4293  { 8364, "euro" },
4294  { 8472, "weierp" },
4295  { 8465, "image" },
4296  { 8476, "real" },
4297  { 8482, "trade" },
4298  { 8501, "alefsym" },
4299  { 8592, "larr" },
4300  { 8593, "uarr" },
4301  { 8594, "rarr" },
4302  { 8595, "darr" },
4303  { 8596, "harr" },
4304  { 8629, "crarr" },
4305  { 8656, "lArr" },
4306  { 8657, "uArr" },
4307  { 8658, "rArr" },
4308  { 8659, "dArr" },
4309  { 8660, "hArr" },
4310  { 8704, "forall" },
4311  { 8706, "part" },
4312  { 8707, "exist" },
4313  { 8709, "empty" },
4314  { 8711, "nabla" },
4315  { 8712, "isin" },
4316  { 8713, "notin" },
4317  { 8715, "ni" },
4318  { 8719, "prod" },
4319  { 8721, "sum" },
4320  { 8722, "minus" },
4321  { 8727, "lowast" },
4322  { 8730, "radic" },
4323  { 8733, "prop" },
4324  { 8734, "infin" },
4325  { 8736, "ang" },
4326  { 8743, "and" },
4327  { 8744, "or" },
4328  { 8745, "cap" },
4329  { 8746, "cup" },
4330  { 8747, "int" },
4331  { 8756, "there4" },
4332  { 8764, "sim" },
4333  { 8773, "cong" },
4334  { 8776, "asymp" },
4335  { 8800, "ne" },
4336  { 8801, "equiv" },
4337  { 8804, "le" },
4338  { 8805, "ge" },
4339  { 8834, "sub" },
4340  { 8835, "sup" },
4341  { 8836, "nsub" },
4342  { 8838, "sube" },
4343  { 8839, "supe" },
4344  { 8853, "oplus" },
4345  { 8855, "otimes" },
4346  { 8869, "perp" },
4347  { 8901, "sdot" },
4348  { 8968, "lceil" },
4349  { 8969, "rceil" },
4350  { 8970, "lfloor" },
4351  { 8971, "rfloor" },
4352  { 9001, "lang" },
4353  { 9002, "rang" },
4354  { 9674, "loz" },
4355  { 9824, "spades" },
4356  { 9827, "clubs" },
4357  { 9829, "hearts" },
4358  { 9830, "diams" },
// Terminating record: code 0 ends every scan of the table
4359  { 0, 0 }
4360 };
4361 
// Look up the entity name for the character code `uch` by scanning
// s_HtmlEntities up to its terminating record; returns the name of the first
// record with that code, or kEmptyStr when there is none.
// NOTE(review): the signature line is absent from this listing — presumably
// NStr::HtmlEntity() taking a TUnicodeSymbol; confirm.
4363 {
4364  const struct tag_HtmlEntities* p = s_HtmlEntities;
4365  for ( ; p->u != 0; ++p) {
4366  if (uch == p->u) {
4367  return p->s;
4368  }
4369  }
4370  return kEmptyStr;
4371 }
4372 
4373 string NStr::HtmlDecode(const CTempString str, EEncoding encoding, THtmlDecode* result_flags)
4374 {
4375  string ustr;
4376  THtmlDecode result = 0;
4377 
4378  if (encoding == eEncoding_Unknown) {
4379  encoding = CUtf8::GuessEncoding(str);
4380  if (encoding == eEncoding_Unknown) {
4381  NCBI_THROW2(CStringException, eBadArgs,
4382  "Unable to guess the source string encoding", 0);
4383  }
4384  }
4385  // wild guess...
4386  ustr.reserve(str.size());
4387 
4388  CTempString::const_iterator i, e = str.end();
4389  char ch;
4391 
4392  for (i = str.begin(); i != e;) {
4393  ch = *(i++);
4394  //check for HTML entities and character references
4395  if (i != e && ch == '&') {
4396  CTempString::const_iterator itmp, end_of_entity, start_of_entity;
4397  itmp = end_of_entity = start_of_entity = i;
4398  bool ent, dec, hex, parsed=false;
4399  ent = isalpha((unsigned char)(*itmp)) != 0;
4400  dec = !ent && *itmp == '#' && ++itmp != e &&
4401  isdigit((unsigned char)(*itmp)) != 0;
4402  hex = !dec && itmp != e &&
4403  (*itmp == 'x' || *itmp == 'X') && ++itmp != e &&
4404  isxdigit((unsigned char)(*itmp)) != 0;
4405  start_of_entity = itmp;
4406 
4407  if (itmp != e && (ent || dec || hex)) {
4408  // do not look too far
4409  for (int len=0; len<16 && itmp != e; ++len, ++itmp) {
4410  if (*itmp == '&' || *itmp == '#') {
4411  break;
4412  }
4413  if (*itmp == ';') {
4414  end_of_entity = itmp;
4415  break;
4416  }
4417  ent = ent && isalnum( (unsigned char)(*itmp)) != 0;
4418  dec = dec && isdigit( (unsigned char)(*itmp)) != 0;
4419  hex = hex && isxdigit((unsigned char)(*itmp)) != 0;
4420  }
4421  if (end_of_entity != i && (ent || dec || hex)) {
4422  uch = 0;
4423  if (ent) {
4424  string entity(start_of_entity,end_of_entity);
4425  const struct tag_HtmlEntities* p = s_HtmlEntities;
4426  for ( ; p->u != 0; ++p) {
4427  if (entity.compare(p->s) == 0) {
4428  uch = p->u;
4429  parsed = true;
4430  result |= fHtmlDec_CharRef_Entity;
4431  break;
4432  }
4433  }
4434  } else {
4435  parsed = true;
4436  result |= fHtmlDec_CharRef_Numeric;
4437  for (itmp = start_of_entity; itmp != end_of_entity; ++itmp) {
4438  TUnicodeSymbol ud = *itmp;
4439  if (dec) {
4440  uch = 10 * uch + (ud - '0');
4441  } else if (hex) {
4442  if (ud >='0' && ud <= '9') {
4443  ud -= '0';
4444  } else if (ud >='a' && ud <= 'f') {
4445  ud -= 'a';
4446  ud += 10;
4447  } else if (ud >='A' && ud <= 'F') {
4448  ud -= 'A';
4449  ud += 10;
4450  }
4451  uch = 16 * uch + ud;
4452  }
4453  }
4454  }
4455  if (parsed) {
4456  ustr += CUtf8::AsUTF8(&uch,1);
4457  i = ++end_of_entity;
4458  continue;
4459  }
4460  }
4461  }
4462  }
4463  // no entity - append as is
4464  if (encoding == eEncoding_UTF8 || encoding == eEncoding_Ascii) {
4465  ustr.append( 1, ch );
4466  } else {
4467  result |= fHtmlDec_Encoding_Changed;
4468  ustr += CUtf8::AsUTF8(CTempString(&ch,1), encoding);
4469  }
4470  }
4471  if (result_flags) {
4472  *result_flags = result;
4473  }
4474  return ustr;
4475 }
4476 
4477 
// JSON encoder body: double quote and backslash get a backslash prefix;
// characters below 0x20 and characters whose value converts to 0x80 or above
// are written as \u00XX, where XX is the value of that single byte in
// lowercase hex; everything else is copied unchanged.
// NOTE(review): the signature line is absent from this listing; the body
// reads `str` — presumably NStr::JsonEncode(); confirm. Each byte of a
// multi-byte sequence is escaped on its own, so the input is effectively
// treated as single-byte text — confirm that this is what callers expect.
4479 // http://www.json.org/
4480 {
4481  string result;
4482  SIZE_TYPE i;
4483  // wild guess...
4484  result.reserve(str.size());
4485 
4486  for (i = 0; i < str.size(); i++) {
4487  char c = str[i];
4488  switch ( c ) {
4489  case '"':
4490  result.append("\\\"");
4491  break;
4492  case '\\':
4493  result.append("\\\\");
4494  break;
4495  default:
4496  if ((unsigned int)c < 0x20 || (unsigned int)c >= 0x80) {
4497  static const char* charmap = "0123456789abcdef";
4498  result.append("\\u00");
4499  Uint1 ch = c;
4500  unsigned hi = ch >> 4;
4501  unsigned lo = ch & 0xF;
4502  result.append(1, charmap[hi]);
4503  result.append(1, charmap[lo]);
4504  } else {
4505  result.append(1, c);
4506  }
4507  break;
4508  }
4509  }
4510  return result;
4511 }
4512 
4513 
4514 string NStr::ShellEncode(const string& str)
4515 {
4516  // 1. Special-case of non-printable characters. We have no choice and
4517  // must use BASH extensions if we want printable output.
4518  //
4519  // Aesthetic issue: Most people are not familiar with the BASH-only
4520  // quoting style. Avoid it as much as possible.
4521 
4522  ITERATE ( string, it, str ) {
4523  if ( !isprint(Uchar(*it)) ) {
4524  return "$'" + NStr::PrintableString(str) + "'";
4525  }
4526  }
4527 
4528  /////////////////////////////////////////////////////////////////////////
4529  // Bourne Shell quoting as IEEE-standard without special extensions.
4530  //
4531  // There are 3 basic ways to quote/escape in Bourne Shell:
4532  //
4533  // - Single-quotes. All characters (including non-printable
4534  // characters newlines, backslashes), are literal. There is no escape.
4535  // - Double-quotes. Need to escape some metacharacters, such as literal
4536  // escape (\), variable expansion ($) and command substitution (`).
4537  // - Escape without quotes. Use backslash.
4538  /////////////////////////////////////////////////////////////////////////
4539 
4540  // 2. Non-empty printable string without meta-characters.
4541  //
4542  // Shell special characters, according to IEEE Std 1003.1,
4543  // plus ! (Bourne shell exit status negation and Bash history expansion),
4544  // braces (Bourne enhanced expansion), space, tab, and newline.
4545  //
4546  // See http://www.opengroup.org/onlinepubs/009695399/toc.htm
4547  // See Bourne and Bash man pages.
4548 
4549  if (!str.empty() &&
4550  str.find_first_of("!{} \t\r\n[|&;<>()$`\"'*?#~=%\\") == NPOS) {
4551  return str;
4552  }
4553 
4554  // 3. Printable string, but either empty or some shell meta-characters.
4555  //
4556  // Aesthetics preference:
4557  // i) If the string includes literal single-quotes, then prefer
4558  // double-quoting provided there is no need to escape embedded
4559  // literal double-quotes, escapes (\), variable substitution ($),
4560  // or command substitution (`).
4561 
4562  if (str.find('\'') != NPOS &&
4563  str.find_first_of("\"\\$`") == NPOS) {
4564  return "\"" + str + "\"";
4565  }
4566 
4567  // Use single-quoting. The only special case for Bourne shell
4568  // single-quoting is a literal single-quote, which needs to
4569  // be pulled out of the quoted region.
4570  //
4571  // Single-quoting does not have any escape character, so close
4572  // the quoted string ('), then emit an escaped or quoted literal
4573  // single-quote (\' or "'"), and resume the quoted string (').
4574  //
4575  // Aesthetics preferences:
4576  // ii) Prefer single-quoting over escape characters, especially
4577  // escaped whitespace. However, this is in compromise to optimal
4578  // quoting: if there are many literal single-quotes and the
4579  // use of double-quotes would involve the need to escape embedded
4580  // characters, then it may be more pleasing to escape the
4581  // shell meta-characters, and avoid the need for single-quoting
4582  // in the presence of literal single-quotes.
4583  // iii) If there are no literal double-quotes, then all else being equal,
4584  // avoid double-quotes and prefer escaping. Double-quotes are
4585  // more commonly used by enclosing formats such as ASN.1 Text
4586  // and CVS, and would thus need to be escaped. If there are
4587  // literal double-quotes, then having them is in the output is
4588  // unavoidable, and this aesthetics rule becomes secondary to
4589  // the preference for avoiding escape characters. If there are
4590  // literal escape characters, then having them is unavoidable
4591  // and avoidance of double-quotes is once again recommended.
4592 
4593  // TODO: Should simplify runs of multiple quotes, for example:
4594  // '\'''\'''\'' -> '"'''"'
4595 
4596  bool avoid_double_quotes = (str.find('"') == NPOS ||
4597  str.find('\\') != NPOS);
4598  string s = "'" + NStr::Replace(str, "'",
4599  avoid_double_quotes ? "'\\''" : "'\"'\"'") + "'";
4600 
4601  // Aesthetic improvement: Remove paired single-quotes ('')
4602  // that aren't escaped, as these evaluate to an empty string.
4603  // Don't apply this simplification for the degenerate case when
4604  // the string is the empty string ''. (Non degenerate strings
4605  // must be length greater than 2). Implement the equivalent
4606  // of the Perl regexp:
4607  //
4608  // s/(?<!\\)''//g
4609  //
4610  if (s.size() > 2) {
4611  size_t pos = 0;
4612  while ( true ) {
4613  pos = s.find("''", pos);
4614  if (pos == NPOS) break;
4615  if (pos == 0 || s[pos-1] != '\\') {
4616  s.erase(pos, 2);
4617  } else {
4618  ++pos;
4619  }
4620  }
4621  }
4622 
4623  return s;
4624 }
4625 
4626 
4627 string NStr::ParseEscapes(const CTempString str, EEscSeqRange mode, char user_char)
4628 {
4629  string out;
4630  out.reserve(str.size()); // result string can only be smaller
4631  SIZE_TYPE pos = 0;
4632  bool is_error = false;
4633 
4634  while (pos < str.size() || !is_error) {
4635  SIZE_TYPE pos2 = str.find('\\', pos);
4636  if (pos2 == NPOS) {
4637  //~ out += str.substr(pos);
4638  CTempString sub(str, pos);
4639  out += sub;
4640  break;
4641  }
4642  //~ out += str.substr(pos, pos2 - pos);
4643  CTempString sub(str, pos, pos2-pos);
4644  out += sub;
4645  if (++pos2 == str.size()) {
4646  NCBI_THROW2(CStringException, eFormat,
4647  "Unterminated escape sequence", pos2);
4648  }
4649  switch (str[pos2]) {
4650  case 'a': out += '\a'; break;
4651  case 'b': out += '\b'; break;
4652  case 'f': out += '\f'; break;
4653  case 'n': out += '\n'; break;
4654  case 'r': out += '\r'; break;
4655  case 't': out += '\t'; break;
4656  case 'v': out += '\v'; break;
4657  case 'x':
4658  {{
4659  pos = ++pos2;
4660  while (pos < str.size()
4661  && isxdigit((unsigned char) str[pos])) {
4662  pos++;
4663  }
4664  if (pos > pos2) {
4665  SIZE_TYPE len = pos-pos2;
4666  if ((mode == eEscSeqRange_FirstByte) && (len > 2)) {
4667  // Take only 2 first hex-digits
4668  len = 2;
4669  pos = pos2 + 2;
4670  }
4671  unsigned int value =
4672  StringToUInt(CTempString(str, pos2, len), 0, 16);
4673  if ((mode != eEscSeqRange_Standard) && (value > 255)) {
4674  // eEscSeqRange_Standard -- by default
4675  switch (mode) {
4677  // Already have right value
4678  break;
4679  case eEscSeqRange_Throw:
4680  NCBI_THROW2(CStringException, eFormat,
4681  "Escape sequence '" + string(CTempString(str, pos2, len)) +
4682  "' is out of range [0-255]", pos2);
4683  break;
4684  case eEscSeqRange_Errno:
4685  CNcbiError::SetErrno(errno = ERANGE, str);
4686  is_error = true;
4687  continue;
4688  case eEscSeqRange_User:
4689  value = (unsigned)user_char;
4690  break;
4691  default:
4692  NCBI_THROW2(CStringException, eFormat, "Wrong set of flags", pos2);
4693  }
4694  }
4695  out += static_cast<char>(value);
4696  } else {
4697  NCBI_THROW2(CStringException, eFormat,
4698  "\\x followed by no hexadecimal digits", pos);
4699  }
4700  }}
4701  continue;
4702  case '0': case '1': case '2': case '3':
4703  case '4': case '5': case '6': case '7':
4704  {{
4705  pos = pos2;
4706  unsigned char c = (unsigned char)(str[pos++] - '0');
4707  while (pos < pos2 + 3 && pos < str.size()
4708  && str[pos] >= '0' && str[pos] <= '7') {
4709  c = (unsigned char)((c << 3) | (str[pos++] - '0'));
4710  }
4711  out += c;
4712  }}
4713  continue;
4714  case '\n':
4715  // quoted EOL means no EOL
4716  break;
4717  default:
4718  out += str[pos2];
4719  break;
4720  }
4721  pos = pos2 + 1;
4722  }
4723  if (mode == eEscSeqRange_Errno) {
4724  if (is_error) {
4725  return kEmptyStr;
4726  }
4727  errno = 0;
4728  }
4729  return out;
4730 }
4731 
4732 
4733 string NStr::ParseQuoted(const CTempString str, size_t* n_read /*= NULL*/)
4734 {
4735  const char* str_pos = str.data();
4736  char quote_char;
4737 
4738  if (str.empty() || ((quote_char = *str_pos) != '"' && quote_char != '\'')) {
4739  NCBI_THROW2(CStringException, eFormat,
4740  "The source string must start with a quote", 0);
4741  }
4742 
4743  const char* str_end = str_pos + str.length();
4744  bool escaped = false;
4745 
4746  while (++str_pos < str_end) {
4747  if (*str_pos == quote_char && !escaped) {
4748  size_t pos = str_pos - str.data();
4749  if (n_read != NULL)
4750  *n_read = pos + 1;
4751  return ParseEscapes(CTempString(str.data() + 1, pos - 1));
4752  } else {
4753  escaped = *str_pos == '\\' ? !escaped : false;
4754  }
4755  }
4756  NCBI_THROW2(CStringException, eFormat,
4757  "Unterminated quoted string", str.length());
4758 }
4759 
4760 
4761 // Determines the end of an HTML <...> tag, accounting for attributes
4762 // and comments (the latter allowed only within <!...>).
4763 static SIZE_TYPE s_EndOfTag(const string& str, SIZE_TYPE start)
4764 {
4765  _ASSERT(start < str.size() && str[start] == '<');
4766  bool comments_ok = (start + 1 < str.size() && str[start + 1] == '!');
4767  for (SIZE_TYPE pos = start + 1; pos < str.size(); ++pos) {
4768  switch (str[pos]) {
4769  case '>': // found the end
4770  return pos;
4771 
4772  case '\"': // start of "string"; advance to end
4773  pos = str.find('\"', pos + 1);
4774  if (pos == NPOS) {
4775  NCBI_THROW2(CStringException, eFormat,
4776  "Unclosed string in HTML tag", start);
4777  // return pos;
4778  }
4779  break;
4780 
4781  case '-': // possible start of -- comment --; advance to end
4782  if (comments_ok && pos + 1 < str.size()
4783  && str[pos + 1] == '-') {
4784  pos = str.find("--", pos + 2);
4785  if (pos == NPOS) {
4786  NCBI_THROW2(CStringException, eFormat,
4787  "Unclosed comment in HTML tag", start);
4788  // return pos;
4789  } else {
4790  ++pos;
4791  }
4792  }
4793  }
4794  }
4795  NCBI_THROW2(CStringException, eFormat, "Unclosed HTML tag", start);
4796  // return NPOS;
4797 }
4798 
4799 
4800 // Determines the end of an HTML &foo; character/entity reference
4801 // (which might not actually end with a semicolon :-/ , but we ignore that case)
4802 static SIZE_TYPE s_EndOfReference(const string& str, SIZE_TYPE start)
4803 {
4804  _ASSERT(start < str.size() && str[start] == '&');
4805 
4806  SIZE_TYPE pos = str.find_first_not_of
4807  ("#0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
4808  start + 1);
4809  if (pos != NPOS && str[pos] == ';') {
4810  // found terminating semicolon, so it's valid, and we return that
4811  return pos;
4812  } else {
4813  // We consider it just a '&' by itself since it's invalid
4814  return start;
4815  }
4816 }
4817 
4818 
4819 static SIZE_TYPE s_VisibleHtmlWidth(const string& str)
4820 {
4821  SIZE_TYPE width = 0, pos = 0;
4822  for (;;) {
4823  SIZE_TYPE pos2 = str.find_first_of("<&", pos);
4824  if (pos2 == NPOS) {
4825  width += str.size() - pos;
4826  break;
4827  } else {
4828  width += pos2 - pos;
4829  if (str[pos2] == '&') {
4830  ++width;
4831  pos = s_EndOfReference(str, pos);
4832  } else {
4833  pos = s_EndOfTag(str, pos);
4834  }
4835  if (pos == NPOS) {
4836  break;
4837  } else {
4838  ++pos;
4839  }
4840  }
4841  }
4842  return width;
4843 }
4844 
// Locale-independent whitespace test: true for the blank (0x20) and for the
// control characters 0x09..0x0D (TAB, LF, VT, FF, CR).
static
inline bool _isspace(unsigned char c)
{
    if (c == 0x20) {
        return true;
    }
    return 0x09 <= c  &&  c <= 0x0D;
}
4850 
4851 template<typename _D>
4852 void NStr::WrapIt(const string& str, SIZE_TYPE width,
4853  _D& dest, TWrapFlags flags,
4854  const string* prefix,
4855  const string* prefix1)
4856 {
4857  if (prefix == 0) {
4858  prefix = &kEmptyStr;
4859  }
4860 
4861  if (prefix1 == 0)
4862  prefix1 = prefix;
4863 
4864  SIZE_TYPE pos = 0, len = str.size(), nl_pos = 0;
4865 
4866  const bool is_html = flags & fWrap_HTMLPre ? true : false;
4867  const bool do_flat = (flags & fWrap_FlatFile) != 0;
4868  string temp_back; temp_back.reserve(width);
4869 
4870  enum EScore { // worst to best
4871  eForced,
4872  ePunct,
4873  eComma,
4874  eSpace,
4875  eNewline
4876  };
4877 
4878  // To avoid copying parts of str when we need to store a
4879  // substr of str, we store the substr as a pair
4880  // representing start (inclusive) and end (exclusive).
4881  typedef pair<SIZE_TYPE, SIZE_TYPE> TWrapSubstr;
4882 
4883  // This variable is used for HTML links that cross line boundaries.
4884  // Since it's aesthetically displeasing for a link to cross a boundary, we
4885  // close it at the end of each line and re-open it after the next line's
4886  // prefix
4887  // (This is needed in, e.g. AE017351)
4888  TWrapSubstr best_link(0, 0); // last link found before current best_pos
4889  TWrapSubstr latest_link(0, 0); // last link found at all
4890 
4891  while (pos < len) {
4892  bool hyphen = false; // "-" or empty
4893  SIZE_TYPE column = is_html ? s_VisibleHtmlWidth(*prefix1) : prefix1->size();
4894  SIZE_TYPE column0 = column;
4895  // the next line will start at best_pos
4896  SIZE_TYPE best_pos = NPOS;
4897  EScore best_score = eForced;
4898 
4899  // certain logic can be skipped if this part has no backspace,
4900  // which is, by far, the most common case
4901  bool thisPartHasBackspace = false;
4902 
4903  temp_back = *prefix1;
4904 
4905  // append any still-open links from previous lines
4906  if (is_html && best_link.second != 0) {
4907  temp_back.append(
4908  str.begin() + best_link.first,
4909  str.begin() + best_link.second);
4910  }
4911 
4912  SIZE_TYPE pos0 = pos;
4913 
4914  // we can't do this in HTML mode because we might have to deal with
4915  // link tags that go across lines.
4916  if (!is_html) {
4917  if (nl_pos <= pos) {
4918  nl_pos = str.find('\n', pos);
4919  if (nl_pos == NPOS) {
4920  nl_pos = len;
4921  }
4922  }
4923  if (column + (nl_pos - pos) <= width) {
4924  pos0 = nl_pos;
4925  }
4926  }
4927 
4928  for (SIZE_TYPE pos2 = pos0; pos2 < len && column <= width;
4929  ++pos2, ++column) {
4930  EScore score = eForced;
4931  SIZE_TYPE score_pos = pos2;
4932  const char c = str[pos2];
4933 
4934  if (c == '\n') {
4935  best_pos = pos2;
4936  best_score = eNewline;
4937  best_link = latest_link;
4938  break;
4939  }
4940  else if (_isspace((unsigned char)c)) {
4941  if (!do_flat && pos2 > 0 &&
4942  _isspace((unsigned char)str[pos2 - 1])) {
4943  if (pos2 < len - 1 && str[pos2 + 1] == '\b') {
4944  thisPartHasBackspace = true;
4945  }
4946  continue; // take the first space of a group
4947  }
4948  score = eSpace;
4949  }
4950  else if (is_html && c == '<') {
4951  // treat tags as zero-width...
4952  SIZE_TYPE start_of_tag = pos2;
4953  pos2 = s_EndOfTag(str, pos2);
4954  --column;
4955  if (pos2 == NPOS) {
4956  break;
4957  }
4958 
4959  if ((pos2 - start_of_tag) >= 6 &&
4960  str[start_of_tag + 1] == 'a' &&
4961  str[start_of_tag + 2] == ' ' &&
4962  str[start_of_tag + 3] == 'h' &&
4963  str[start_of_tag + 4] == 'r' &&
4964  str[start_of_tag + 5] == 'e' &&
4965  str[start_of_tag + 6] == 'f')
4966  {
4967  // remember current link in case of line wrap
4968  latest_link.first = start_of_tag;
4969  latest_link.second = pos2 + 1;
4970  }
4971  if ((pos2 - start_of_tag) >= 3 &&
4972  str[start_of_tag + 1] == '/' &&
4973  str[start_of_tag + 2] == 'a' &&
4974  str[start_of_tag + 3] == '>')
4975  {
4976  // link is closed
4977  latest_link.first = 0;
4978  latest_link.second = 0;
4979  }
4980  }
4981  else if (is_html && c == '&') {
4982  // ...and references as single characters
4983  pos2 = s_EndOfReference(str, pos2);
4984  if (pos2 == NPOS) {
4985  break;
4986  }
4987  }
4988  else if (c == ',' && column < width && score_pos < len - 1) {
4989  score = eComma;
4990  ++score_pos;
4991  }
4992  else if (do_flat ? c == '-' : ispunct((unsigned char)c)) {
4993  // For flat files, only whitespace, hyphens and commas
4994  // are special.
4995  switch (c) {
4996  case '(': case '[': case '{': case '<': case '`':
4997  score = ePunct;
4998  break;
4999  default:
5000  if (score_pos < len - 1 && column < width) {
5001  score = ePunct;
5002  ++score_pos;
5003  }
5004  break;
5005  }
5006  }
5007 
5008  if (score >= best_score && score_pos > pos0) {
5009  best_pos = score_pos;
5010  best_score = score;
5011  best_link = latest_link;
5012  }
5013 
5014  while (pos2 < len - 1 && str[pos2 + 1] == '\b') {
5015  // Account for backspaces
5016  ++pos2;
5017  if (column > column0) {
5018  --column;
5019  }
5020  thisPartHasBackspace = true;
5021  }
5022  }
5023 
5024  if (best_score != eNewline && column <= width) {
5025  if (best_pos != len) {
5026  // If the whole remaining text can fit, don't split it...
5027  best_pos = len;
5028  best_link = latest_link;
5029  // Force backspace checking, to play it safe
5030  thisPartHasBackspace = true;
5031  }
5032  }
5033  else if (best_score == eForced && (flags & fWrap_Hyphenate)) {
5034  hyphen = true;
5035  --best_pos;
5036  }
5037 
5038  {{
5039  string::const_iterator begin = str.begin() + pos;
5040  string::const_iterator end = str.begin() + best_pos;
5041  if (thisPartHasBackspace) {
5042  // eat backspaces and the characters (if any) that precede them
5043 
5044  string::const_iterator bs; // position of next backspace
5045  while ((bs = find(begin, end, '\b')) != end) {
5046  if (bs != begin) {
5047  // add all except the last one
5048  temp_back.append(begin, bs - 1);
5049  }
5050  else {
5051  // The backspace is at the beginning of next substring,
5052  // so we should remove previously added symbol if any.
5053  SIZE_TYPE size = temp_back.size();
5054  if (size > prefix1->size()) { // current size > prefix size
5055  temp_back.resize(size - 1);
5056  }
5057  }
5058  // skip over backspace
5059  begin = bs + 1;
5060  }
5061  }
5062  if (begin != end) {
5063  // add remaining characters
5064  temp_back.append(begin, end);
5065  }
5066  }}
5067 
5068  // if we didn't close the link on this line, we
5069  // close it here
5070  if (is_html && best_link.second != 0) {
5071  temp_back += "</a>";
5072  }
5073 
5074  if (hyphen) {
5075  temp_back += '-';
5076  }
5077  pos = best_pos;
5078  prefix1 = prefix;
5079 
5080  if (do_flat) {
5081  if (best_score == eSpace) {
5082  while (str[pos] == ' ') {
5083  ++pos;
5084  }
5085  if (str[pos] == '\n') {
5086  ++pos;
5087  }
5088  }
5089  if (best_score == eNewline) {
5090  ++pos;
5091  }
5092  }
5093  else {
5094  if (best_score == eSpace || best_score == eNewline) {
5095  ++pos;
5096  }
5097  }
5098  while (pos < len && str[pos] == '\b') {
5099  ++pos;
5100  }
5101 
5102  dest.Append(temp_back);
5103  }
5104 }
5105 
5106 
5107 void NStr::Wrap(const string& str, SIZE_TYPE width,
5108  IWrapDest& dest, TWrapFlags flags,
5109  const string* prefix,
5110  const string* prefix1)
5111 {
5112  WrapIt(str, width, dest, flags, prefix, prefix1);
5113 }
5114 
5115 
5116 list<string>& NStr::Wrap(const string& str, SIZE_TYPE width,
5117  list<string>& arr2, NStr::TWrapFlags flags,
5118  const string* prefix, const string* prefix1)
5119 {
5120  CWrapDestStringList d(arr2);
5121  WrapIt(str, width, d, flags, prefix, prefix1);
5122  return arr2;
5123 }
5124 
5125 
5126 list<string>& NStr::WrapList(const list<string>& l, SIZE_TYPE width,
5127  const string& delim, list<string>& arr,
5129  const string* prefix,
5130  const string* prefix1)
5131 {
5132  if (l.empty()) {
5133  return arr;
5134  }
5135 
5136  const string* pfx = prefix1 ? prefix1 : prefix;
5137  string s = *pfx;
5138  bool is_html = flags & fWrap_HTMLPre ? true : false;
5139  SIZE_TYPE column = is_html? s_VisibleHtmlWidth(s) : s.size();
5140  SIZE_TYPE delwidth = is_html? s_VisibleHtmlWidth(delim) : delim.size();
5141  bool at_start = true;
5142 
5143  ITERATE (list<string>, it, l) {
5144  SIZE_TYPE term_width = is_html ? s_VisibleHtmlWidth(*it) : it->size();
5145  if ( at_start ) {
5146  if (column + term_width <= width) {
5147  s += *it;
5148  column += term_width;
5149  at_start = false;
5150  } else {
5151  // Can't fit, even on its own line; break separately.
5152  Wrap(*it, width, arr, flags, prefix, pfx);
5153  pfx = prefix;
5154  s = *prefix;
5155  column = is_html ? s_VisibleHtmlWidth(s) : s.size();
5156  at_start = true;
5157  }
5158  } else if (column + delwidth + term_width <= width) {
5159  s += delim;
5160  s += *it;
5161  column += delwidth + term_width;
5162  at_start = false;
5163  } else {
5164  // Can't fit on this line; break here and try again.
5165  arr.push_back(s);
5166  pfx = prefix;
5167  s = *prefix;
5168  column = is_html ? s_VisibleHtmlWidth(s) : s.size();
5169  at_start = true;
5170  --it;
5171  }
5172  }
5173  arr.push_back(s);
5174  return arr;
5175 }
5176 
5177 
5178 list<string>& NStr::Justify(const CTempString str,
5179  SIZE_TYPE width,
5180  list<string>& par,
5181  const CTempString* pfx,
5182  const CTempString* pfx1)
5183 {
5184  static const CTempString kNothing;
5185  if (!pfx)
5186  pfx = &kNothing;
5187  const CTempString* p = pfx1 ? pfx1 : pfx;
5188 
5189  SIZE_TYPE pos = 0;
5190  for (SIZE_TYPE len = p->size(); pos < str.size(); len = p->size()) {
5191  list<CTempString> words;
5192  unsigned int nw = 0; // How many words are there in the line
5193  bool big = false;
5194  do {
5195  while (pos < str.size()) {
5196  if (!isspace((unsigned char) str[pos]))
5197  break;
5198  ++pos;
5199  }
5200  SIZE_TYPE start = pos;
5201  while (pos < str.size()) {
5202  if ( isspace((unsigned char) str[pos]))
5203  break;
5204  ++pos;
5205  }
5206  SIZE_TYPE wlen = pos - start;
5207  if (!wlen)
5208  break;
5209  if (width < len + nw + wlen) {
5210  if (nw) {
5211  if (width < wlen && len < width - len)
5212  big = true; // Big word is coming, no space stretch
5213  pos = start; // Will have to rescan this word again
5214  break;
5215  }
5216  big = true; // Long line with a long lonely word :-/
5217  }
5218  words.push_back(CTempString(str, start, wlen));
5219  len += wlen;
5220  ++nw;
5221  if (str[pos - 1] == '.' ||
5222  str[pos - 1] == '!' ||
5223  str[pos - 1] == '?') {
5224  if (len + 1 >= width)
5225  break;
5226  words.push_back(CTempString("", 0));
5227  _ASSERT(!big);
5228  nw++;
5229  }
5230  } while (!big);
5231  if (!nw)
5232  break;
5233  if (words.back().empty()) {
5234  words.pop_back();
5235  _ASSERT(nw > 1);
5236  nw--;
5237  }
5238  SIZE_TYPE space;
5239  if (nw > 1) {
5240  if (pos < str.size() && len < width && !big) {
5241  space = (width - len) / (nw - 1);
5242  nw = (unsigned int)((width - len) % (nw - 1));
5243  } else {
5244  space = 1;
5245  nw = 0;
5246  }
5247  } else
5248  space = 0;
5249  par.push_back(*p);
5250  unsigned int n = 0;
5251  ITERATE(list<CTempString>, w, words) {
5252  if (n)
5253  par.back().append(space + (n <= nw ? 1 : 0) , ' ');
5254  par.back().append(w->data(), w->size());
5255  ++n;
5256  }
5257  p = pfx;
5258  }
5259  return par;
5260 }
5261 
5262 
#if !defined(HAVE_STRDUP)
// Replacement for strdup(), compiled only when HAVE_STRDUP is not defined.
// Returns a malloc()'ed copy of `str` (terminating NUL included), or 0 if
// `str` is null or the allocation fails.  The caller releases it with free().
extern char* strdup(const char* str)
{
    if (str == 0) {
        return 0;
    }
    const size_t bytes = strlen(str) + 1;   // include the terminating NUL
    char* copy = (char*) malloc(bytes);
    if (copy != 0) {
        memcpy(copy, str, bytes);
    }
    return copy;
}
#endif
5274 
5275 
// Byte-to-text encoding table, indexed by the unsigned byte value.
// Left as is: ASCII letters, digits and  ! $ ' ( ) * , - . _
// Blank (0x20) becomes "+"; every other byte becomes "%XX" (uppercase hex).
static const char s_Encode[256][4] = {
    /* 0x00 */ "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
    /* 0x08 */ "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
    /* 0x10 */ "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
    /* 0x18 */ "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
    /* 0x20 */ "+",   "!",   "%22", "%23", "$",   "%25", "%26", "'",
    /* 0x28 */ "(",   ")",   "*",   "%2B", ",",   "-",   ".",   "%2F",
    /* 0x30 */ "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
    /* 0x38 */ "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
    /* 0x40 */ "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
    /* 0x48 */ "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
    /* 0x50 */ "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
    /* 0x58 */ "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
    /* 0x60 */ "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
    /* 0x68 */ "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
    /* 0x70 */ "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
    /* 0x78 */ "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
    /* 0x80 */ "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
    /* 0x88 */ "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
    /* 0x90 */ "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
    /* 0x98 */ "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
    /* 0xA0 */ "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
    /* 0xA8 */ "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
    /* 0xB0 */ "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
    /* 0xB8 */ "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
    /* 0xC0 */ "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
    /* 0xC8 */ "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
    /* 0xD0 */ "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
    /* 0xD8 */ "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
    /* 0xE0 */ "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
    /* 0xE8 */ "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
    /* 0xF0 */ "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
    /* 0xF8 */ "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
};
5310 
// Byte-to-text encoding table, indexed by the unsigned byte value.
// Left as is: ASCII letters and digits only.
// Blank (0x20) becomes "+"; every other byte, punctuation marks included,
// becomes "%XX" (uppercase hex).
static const char s_EncodeMarkChars[256][4] = {
    /* 0x00 */ "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
    /* 0x08 */ "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
    /* 0x10 */ "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
    /* 0x18 */ "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
    /* 0x20 */ "+",   "%21", "%22", "%23", "%24", "%25", "%26", "%27",
    /* 0x28 */ "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F",
    /* 0x30 */ "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
    /* 0x38 */ "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
    /* 0x40 */ "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
    /* 0x48 */ "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
    /* 0x50 */ "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
    /* 0x58 */ "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "%5F",
    /* 0x60 */ "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
    /* 0x68 */ "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
    /* 0x70 */ "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
    /* 0x78 */ "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
    /* 0x80 */ "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
    /* 0x88 */ "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
    /* 0x90 */ "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
    /* 0x98 */ "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
    /* 0xA0 */ "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
    /* 0xA8 */ "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
    /* 0xB0 */ "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
    /* 0xB8 */ "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
    /* 0xC0 */ "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
    /* 0xC8 */ "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
    /* 0xD0 */ "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
    /* 0xD8 */ "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
    /* 0xE0 */ "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
    /* 0xE8 */ "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
    /* 0xF0 */ "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
    /* 0xF8 */ "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
};
5345 
// Byte-to-text encoding table, indexed by the unsigned byte value.
// Left as is: ASCII letters and digits only.
// Every other byte becomes "%XX" (uppercase hex); in particular the blank
// is "%20" here, never "+".
static const char s_EncodePercentOnly[256][4] = {
    /* 0x00 */ "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
    /* 0x08 */ "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
    /* 0x10 */ "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
    /* 0x18 */ "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
    /* 0x20 */ "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27",
    /* 0x28 */ "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F",
    /* 0x30 */ "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
    /* 0x38 */ "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
    /* 0x40 */ "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
    /* 0x48 */ "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
    /* 0x50 */ "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
    /* 0x58 */ "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "%5F",
    /* 0x60 */ "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
    /* 0x68 */ "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
    /* 0x70 */ "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
    /* 0x78 */ "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
    /* 0x80 */ "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
    /* 0x88 */ "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
    /* 0x90 */ "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
    /* 0x98 */ "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
    /* 0xA0 */ "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
    /* 0xA8 */ "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
    /* 0xB0 */ "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
    /* 0xB8 */ "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
    /* 0xC0 */ "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
    /* 0xC8 */ "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
    /* 0xD0 */ "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
    /* 0xD8 */ "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
    /* 0xE0 */ "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
    /* 0xE8 */ "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
    /* 0xF0 */ "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
    /* 0xF8 */ "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
};
5380 
// Byte-to-text encoding table, indexed by the unsigned byte value.
// Left as is: ASCII letters, digits and  . / _
// Blank (0x20) becomes "+"; every other byte becomes "%XX" (uppercase hex).
static const char s_EncodePath[256][4] = {
    /* 0x00 */ "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
    /* 0x08 */ "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
    /* 0x10 */ "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
    /* 0x18 */ "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
    /* 0x20 */ "+",   "%21", "%22", "%23", "%24", "%25", "%26", "%27",
    /* 0x28 */ "%28", "%29", "%2A", "%2B", "%2C", "%2D", ".",   "/",
    /* 0x30 */ "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
    /* 0x38 */ "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
    /* 0x40 */ "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
    /* 0x48 */ "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
    /* 0x50 */ "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
    /* 0x58 */ "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
    /* 0x60 */ "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
    /* 0x68 */ "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
    /* 0x70 */ "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
    /* 0x78 */ "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
    /* 0x80 */ "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
    /* 0x88 */ "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
    /* 0x90 */ "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
    /* 0x98 */ "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
    /* 0xA0 */ "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
    /* 0xA8 */ "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
    /* 0xB0 */ "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
    /* 0xB8 */ "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
    /* 0xC0 */ "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
    /* 0xC8 */ "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
    /* 0xD0 */ "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
    /* 0xD8 */ "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
    /* 0xE0 */ "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
    /* 0xE8 */ "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
    /* 0xF0 */ "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
    /* 0xF8 */ "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
};
5415 
// Encoding table for the scheme component of a URI, indexed by the unsigned
// byte value.
//
// RFC-2396:
//   scheme = alpha *( alpha | digit | "+" | "-" | "." )
//
// Left as is: ASCII letters, digits and  + - .
// Every other byte becomes "%XX" (uppercase hex).
static const char s_EncodeURIScheme[256][4] = {
    /* 0x00 */ "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
    /* 0x08 */ "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
    /* 0x10 */ "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
    /* 0x18 */ "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
    /* 0x20 */ "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27",
    /* 0x28 */ "%28", "%29", "%2A", "+",   "%2C", "-",   ".",   "%2F",
    /* 0x30 */ "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
    /* 0x38 */ "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
    /* 0x40 */ "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
    /* 0x48 */ "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
    /* 0x50 */ "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
    /* 0x58 */ "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "%5F",
    /* 0x60 */ "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
    /* 0x68 */ "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
    /* 0x70 */ "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
    /* 0x78 */ "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
    /* 0x80 */ "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
    /* 0x88 */ "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
    /* 0x90 */ "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
    /* 0x98 */ "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
    /* 0xA0 */ "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
    /* 0xA8 */ "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
    /* 0xB0 */ "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
    /* 0xB8 */ "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
    /* 0xC0 */ "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
    /* 0xC8 */ "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
    /* 0xD0 */ "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
    /* 0xD8 */ "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
    /* 0xE0 */ "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
    /* 0xE8 */ "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
    /* 0xF0 */ "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
    /* 0xF8 */ "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
};
5452 
// Encoding table for the userinfo component of a URI, indexed by the
// unsigned byte value.
//
// RFC-2396:
//   userinfo   = *( unreserved | escaped |
//                   ";" | ":" | "&" | "=" | "+" | "$" | "," )
//   unreserved = alphanum | mark
//   mark       = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
//
// Note: ":" separates the name from the password, so it is encoded
// ("%3A") inside either of them even though the grammar allows it.
//
// Left as is: ASCII letters, digits and  ! $ & ' ( ) * + , - . ; = _ ~
// Every other byte becomes "%XX" (uppercase hex).
static const char s_EncodeURIUserinfo[256][4] = {
    /* 0x00 */ "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
    /* 0x08 */ "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
    /* 0x10 */ "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
    /* 0x18 */ "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
    /* 0x20 */ "%20", "!",   "%22", "%23", "$",   "%25", "&",   "'",
    /* 0x28 */ "(",   ")",   "*",   "+",   ",",   "-",   ".",   "%2F",
    /* 0x30 */ "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
    /* 0x38 */ "8",   "9",   "%3A", ";",   "%3C", "=",   "%3E", "%3F",
    /* 0x40 */ "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
    /* 0x48 */ "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
    /* 0x50 */ "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
    /* 0x58 */ "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
    /* 0x60 */ "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
    /* 0x68 */ "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
    /* 0x70 */ "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
    /* 0x78 */ "x",   "y",   "z",   "%7B", "%7C", "%7D", "~",   "%7F",
    /* 0x80 */ "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
    /* 0x88 */ "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
    /* 0x90 */ "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
    /* 0x98 */ "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
    /* 0xA0 */ "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
    /* 0xA8 */ "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
    /* 0xB0 */ "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
    /* 0xB8 */ "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
    /* 0xC0 */ "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
    /* 0xC8 */ "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
    /* 0xD0 */ "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
    /* 0xD8 */ "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
    /* 0xE0 */ "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
    /* 0xE8 */ "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
    /* 0xF0 */ "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
    /* 0xF8 */ "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
};
5493 
5494 // RFC-2396:
5495 // host = hostname | IPv4address
5496 // hostname = *( domainlabel "." ) toplabel [ "." ]
5497