src/corelib/ncbistr.cpp

Go to the documentation of this file.
00001 /*  $Id: ncbistr.cpp 172519 2009-10-06 14:23:09Z vasilche $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *   Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Authors:  Eugene Vasilchenko, Denis Vakatov
00027  *
00028  * File Description:
00029  *   Some helper functions
00030  *
00031  */
00032 
00033 #include <ncbi_pch.hpp>
00034 #include <common/ncbi_source_ver.h>
00035 #include <corelib/ncbistr.hpp>
00036 #include <corelib/tempstr.hpp>
00037 #include <corelib/ncbi_limits.hpp>
00038 #include <corelib/ncbistr_util.hpp>
00039 #include <corelib/error_codes.hpp>
00040 #include <memory>
00041 #include <algorithm>
00042 #include <errno.h>
00043 #include <stdio.h>
00044 
00045 
00046 #define NCBI_USE_ERRCODE_X   Corelib_Util
00047 
00048 
00049 BEGIN_NCBI_SCOPE
00050 
00051 
// Digit characters used for number-to-string conversion, indexed by the
// digit's numeric value (covers every radix up to base 36).
static const char s_Hex[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
00054 
00055 
00056 inline SIZE_TYPE s_DiffPtr(const char* end, const char* start)
00057 {
00058     return end ? (SIZE_TYPE)(end - start) : (SIZE_TYPE) 0;
00059 }
00060 
// Empty C-string constant.
const char *const kEmptyCStr = "";


// Version marker strings: a fixed prefix followed by the NCBI_AS_STRING()
// expansion of the corresponding version macro (presumably its stringified
// value -- NCBI_AS_STRING is defined elsewhere).
// Each definition is preceded by an 'extern' declaration so that the const
// object gets external linkage (namespace-scope const objects have internal
// linkage by default in C++).
extern const char* const kNcbiDevelopmentVersionString;
const char* const kNcbiDevelopmentVersionString
    = "NCBI_DEVELOPMENT_VER_" NCBI_AS_STRING(NCBI_DEVELOPMENT_VER);

// The production marker exists only when NCBI_PRODUCTION_VER is defined.
#ifdef NCBI_PRODUCTION_VER
extern const char* const kNcbiProductionVersionString;
const char* const kNcbiProductionVersionString
    = "NCBI_PRODUCTION_VER_" NCBI_AS_STRING(NCBI_PRODUCTION_VER);
#endif
00073 
00074 
00075 #if !defined(NCBI_OS_MSWIN) && !( defined(NCBI_OS_LINUX)  &&  defined(NCBI_COMPILER_GCC) )
00076 const string* CNcbiEmptyString::m_Str = 0;
00077 const string& CNcbiEmptyString::FirstGet(void) {
00078     static const string s_Str = "";
00079     m_Str = &s_Str;
00080     return s_Str;
00081 }
00082 #endif
00083 
00084 
00085 bool NStr::IsBlank(const string& str, SIZE_TYPE pos)
00086 {
00087     SIZE_TYPE len = str.length();
00088     for (SIZE_TYPE idx = pos; idx < len; ++idx) {
00089         if (!isspace((unsigned char) str[idx])) {
00090             return false;
00091         }
00092     }
00093     return true;
00094 }
00095 
00096 
00097 int NStr::CompareCase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
00098                       const char* pattern)
00099 {
00100     if (pos == NPOS  ||  !n  ||  str.length() <= pos) {
00101         return *pattern ? -1 : 0;
00102     }
00103     if ( !*pattern ) {
00104         return 1;
00105     }
00106 
00107     if (n == NPOS  ||  n > str.length() - pos) {
00108         n = str.length() - pos;
00109     }
00110 
00111     const char* s = str.data() + pos;
00112     while (n  &&  *pattern  &&  *s == *pattern) {
00113         s++;  pattern++;  n--;
00114     }
00115 
00116     if (n == 0) {
00117         return *pattern ? -1 : 0;
00118     }
00119 
00120     return *s - *pattern;
00121 }
00122 
00123 
00124 int NStr::CompareNocase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
00125                         const char* pattern)
00126 {
00127     if (pos == NPOS  ||  !n  ||  str.length() <= pos) {
00128         return *pattern ? -1 : 0;
00129     }
00130     if ( !*pattern ) {
00131         return 1;
00132     }
00133 
00134     if (n == NPOS  ||  n > str.length() - pos) {
00135         n = str.length() - pos;
00136     }
00137 
00138     const char* s = str.data() + pos;
00139     while (n  &&  *pattern  &&
00140            tolower((unsigned char)(*s)) == 
00141            tolower((unsigned char)(*pattern))) {
00142         s++;  pattern++;  n--;
00143     }
00144 
00145     if (n == 0) {
00146         return *pattern ? -1 : 0;
00147     }
00148 
00149     return tolower((unsigned char)(*s)) - tolower((unsigned char)(*pattern));
00150 }
00151 
00152 
00153 int NStr::CompareCase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
00154                       const string& pattern)
00155 {
00156     if (pos == NPOS  ||  !n  ||  str.length() <= pos) {
00157         return pattern.empty() ? 0 : -1;
00158     }
00159     if (pattern.empty()) {
00160         return 1;
00161     }
00162 
00163     if (n == NPOS  ||  n > str.length() - pos) {
00164         n = str.length() - pos;
00165     }
00166 
00167     SIZE_TYPE n_cmp = n;
00168     if (n_cmp > pattern.length()) {
00169         n_cmp = pattern.length();
00170     }
00171     const char* s = str.data() + pos;
00172     const char* p = pattern.data();
00173     while (n_cmp  &&  *s == *p) {
00174         s++;  p++;  n_cmp--;
00175     }
00176 
00177     if (n_cmp == 0) {
00178         if (n == pattern.length())
00179             return 0;
00180         return n > pattern.length() ? 1 : -1;
00181     }
00182 
00183     return *s - *p;
00184 }
00185 
00186 
00187 int NStr::CompareNocase(const string& str, SIZE_TYPE pos, SIZE_TYPE n,
00188                         const string& pattern)
00189 {
00190     if (pos == NPOS  ||  !n  ||  str.length() <= pos) {
00191         return pattern.empty() ? 0 : -1;
00192     }
00193     if (pattern.empty()) {
00194         return 1;
00195     }
00196 
00197     if (n == NPOS  ||  n > str.length() - pos) {
00198         n = str.length() - pos;
00199     }
00200 
00201     SIZE_TYPE n_cmp = n;
00202     if (n_cmp > pattern.length()) {
00203         n_cmp = pattern.length();
00204     }
00205     const char* s = str.data() + pos;
00206     const char* p = pattern.data();
00207     while (n_cmp  &&  
00208            tolower((unsigned char)(*s)) == tolower((unsigned char)(*p))) {
00209         s++;  p++;  n_cmp--;
00210     }
00211 
00212     if (n_cmp == 0) {
00213         if (n == pattern.length())
00214             return 0;
00215         return n > pattern.length() ? 1 : -1;
00216     }
00217 
00218     return tolower((unsigned char)(*s)) - tolower((unsigned char)(*p));
00219 }
00220 
00221 
00222 // NOTE: This code is used also in the CDirEntry::MatchesMask.
00223 
00224 bool NStr::MatchesMask(const char* str, const char* mask, ECase use_case) 
00225 {
00226     char c;
00227     bool infinite = true;
00228 
00229     while (infinite) {
00230         // Analyze symbol in mask
00231         switch ( c = *mask++ ) {
00232         
00233         case '\0':
00234             return *str == '\0';
00235 
00236         case '?':
00237             if (*str == '\0') {
00238                 return false;
00239             }
00240             ++str;
00241             break;
00242 
00243         case '*':
00244             c = *mask;
00245             // Collapse multiple stars
00246             while ( c == '*' ) {
00247                 c = *++mask;
00248             }
00249             if (c == '\0') {
00250                 return true;
00251             }
00252             // General case, use recursion
00253             while ( *str ) {
00254                 if (MatchesMask(str, mask, use_case)) {
00255                     return true;
00256                 }
00257                 ++str;
00258             }
00259             return false;
00260 
00261         default:
00262             // Compare nonpattern character in mask and name
00263             char s = *str++;
00264             if (use_case == eNocase) {
00265                 c = tolower((unsigned char) c);
00266                 s = tolower((unsigned char) s);
00267             }
00268             if (c != s) {
00269                 return false;
00270             }
00271             break;
00272         }
00273     }
00274     return false;
00275 }
00276 
00277 
00278 char* NStr::ToLower(char* str)
00279 {
00280     char* s;
00281     for (s = str;  *str;  str++) {
00282         *str = tolower((unsigned char)(*str));
00283     }
00284     return s;
00285 }
00286 
00287 
00288 string& NStr::ToLower(string& str)
00289 {
00290     NON_CONST_ITERATE (string, it, str) {
00291         *it = tolower((unsigned char)(*it));
00292     }
00293     return str;
00294 }
00295 
00296 
00297 char* NStr::ToUpper(char* str)
00298 {
00299     char* s;
00300     for (s = str;  *str;  str++) {
00301         *str = toupper((unsigned char)(*str));
00302     }
00303     return s;
00304 }
00305 
00306 
00307 string& NStr::ToUpper(string& str)
00308 {
00309     NON_CONST_ITERATE (string, it, str) {
00310         *it = toupper((unsigned char)(*it));
00311     }
00312     return str;
00313 }
00314 
00315 
00316 int NStr::StringToNumeric(const string& str)
00317 {
00318     if ( str.empty()  ||
00319          (!isdigit((unsigned char)(*str.begin())) &  (*str.begin() != '+')) ) {
00320         errno = EINVAL;
00321         return -1;
00322     }
00323     char* endptr = 0;
00324     const char* begptr = str.c_str();
00325     errno = 0;
00326     unsigned long value = strtoul(begptr, &endptr, 10);
00327     if ( errno  ||  !endptr  ||  endptr == begptr  ||
00328         value > (unsigned long) kMax_Int  ||  *endptr ) {
00329         if ( !errno ) {
00330             errno = !endptr || endptr == begptr || *endptr ? EINVAL : ERANGE;
00331         }
00332         return -1;
00333     }
00334     return (int) value;
00335 }
00336 
00337 
// Report a string-to-number conversion failure from the enclosing function.
// The expansion site must have 'flags' and 'str' in scope.
//  - If NStr::fConvErr_NoThrow is set in 'flags': store 'errcode' in errno
//    (unconditionally when 'force_errno' is true, otherwise only if errno is
//    still zero) and return 0 from the enclosing function.
//  - Otherwise: throw CStringException (eConvert) through NCBI_THROW2 with
//    the message "Cannot convert string '<str>' to <to_type>[, <msg>]" and
//    'delta' as the extra argument (presumably the position of the error).
#define S2N_CONVERT_ERROR(to_type, msg, errcode, force_errno, delta)        \
        if (flags & NStr::fConvErr_NoThrow)  {                              \
            if ( force_errno )                                              \
                errno = 0;                                                  \
            if ( !errno )                                                   \
                errno = errcode;                                            \
            /* ignore previously converted value -- always return zero */   \
            return 0;                                                       \
        } else {                                                            \
            CTempString str_tmp(str);                                       \
            CTempString msg_tmp(msg);                                       \
            string smsg;                                                    \
            smsg.reserve(str_tmp.length() + msg_tmp.length() + 50);         \
            smsg += "Cannot convert string '";                              \
            smsg += str;                                                    \
            smsg += "' to " #to_type;                                       \
            if ( !msg_tmp.empty() ) {                                       \
                smsg += ", ";                                               \
                smsg += msg;                                                \
            }                                                               \
            NCBI_THROW2(CStringException, eConvert, smsg, delta);           \
        }                                                                   \
00360 
// Shortcuts for the common failure kinds. All of them pass the local 'pos'
// of the expansion site as the 'delta' argument and overwrite errno.

// Malformed input: EINVAL, no additional message.
#define S2N_CONVERT_ERROR_INVAL(to_type)                                    \
    S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, true, pos)

// Bad radix: EINVAL with an explanation supplied by the caller.
#define S2N_CONVERT_ERROR_RADIX(to_type, msg)                               \
    S2N_CONVERT_ERROR(to_type, msg, EINVAL, true, pos)

// Value does not fit into the target type: ERANGE, message "overflow".
#define S2N_CONVERT_ERROR_OVERFLOW(to_type)                                 \
    S2N_CONVERT_ERROR(to_type, "overflow",ERANGE, true, pos)
00369 
// Fail with EINVAL unless the whole string has been consumed, i.e. unless
// str[pos] is the terminating zero. Uses 'str', 'pos' and 'flags' of the
// expansion site.
#define CHECK_ENDPTR(to_type)                                               \
    if ( str[pos] ) {                                                       \
        S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, true, pos);           \
    }

// Fail if errno is already set or the local 'value' lies outside
// [nmin, nmax]. An errno that is already set is kept; otherwise ERANGE is
// stored.
#define CHECK_RANGE(nmin, nmax, to_type)                                    \
    if ( errno  ||  value < nmin  ||  value > nmax ) {                      \
        S2N_CONVERT_ERROR(to_type, "overflow", ERANGE, false, 0);           \
    }

// Unsigned flavor of CHECK_RANGE: only the upper bound is checked.
#define CHECK_RANGE_U(nmax, to_type)                                        \
    if ( errno  ||  value > nmax ) {                                        \
        S2N_CONVERT_ERROR(to_type, "overflow", ERANGE, false, 0);           \
    }
00384 
// Thousands-separator handling for the digit loops; does nothing unless
// NStr::fAllowCommas is set in 'flags'.
// Must be expanded inside a loop (it uses 'break' and 'continue') that has
// these locals in scope:
//   ch     -- the current character,
//   pos    -- the current position (advanced past an accepted comma),
//   numpos -- the position where the digits started,
//   comma  -- -1 until the first comma is seen, afterwards the number of
//             characters processed since the last comma.
// A comma is accepted only if it is not the first character of the number
// and is either the first comma or exactly 3 characters after the previous
// one; any other comma ends the loop via 'break'.
#define CHECK_COMMAS                                                        \
    /* Check on possible commas */                                          \
    if (flags & NStr::fAllowCommas) {                                       \
        if (ch == ',') {                                                    \
            if ((numpos == pos)  ||                                         \
                ((comma >= 0)  &&  (comma != 3)) ) {                        \
                /* Not first comma, sitting on incorrect place */           \
                break;                                                      \
            }                                                               \
            /* Skip it */                                                   \
            comma = 0;                                                      \
            pos++;                                                          \
            continue;                                                       \
        } else {                                                            \
            if (comma >= 0) {                                               \
                /* Count symbols between commas */                          \
                comma++;                                                    \
            }                                                               \
        }                                                                   \
    }
00405 
00406 
00407 int NStr::StringToInt(const CTempString& str, TStringToNumFlags flags,int base)
00408 {
00409     errno = 0;
00410     Int8 value = StringToInt8(str, flags, base);
00411     CHECK_RANGE(kMin_Int, kMax_Int, int);
00412     return (int) value;
00413 }
00414 
00415 
00416 unsigned int
00417 NStr::StringToUInt(const CTempString& str, TStringToNumFlags flags, int base)
00418 {
00419     errno = 0;
00420     Uint8 value = StringToUInt8(str, flags, base);
00421     CHECK_RANGE_U(kMax_UInt, unsigned int);
00422     return (unsigned int) value;
00423 }
00424 
00425 
00426 long NStr::StringToLong(const CTempString& str, TStringToNumFlags flags,
00427                         int base)
00428 {
00429     errno = 0;
00430     Int8 value = StringToInt8(str, flags, base);
00431     CHECK_RANGE(kMin_Long, kMax_Long, long);
00432     return (long) value;
00433 }
00434 
00435 
00436 unsigned long
00437 NStr::StringToULong(const CTempString& str, TStringToNumFlags flags, int base)
00438 {
00439     errno = 0;
00440     Uint8 value = StringToUInt8(str, flags, base);
00441     CHECK_RANGE_U(kMax_ULong, long);
00442     return (unsigned long) value;
00443 }
00444 
00445 
/// @internal
// Check that symbol 'ch' is a valid digit for a number with radix 'base'.
//
// Digits '0'..'9' have the values 0..9; letters 'a'..'z' / 'A'..'Z' have
// the values 10..35 (case-insensitive).
// If 'value' is not null, it receives the numeric value of every digit or
// letter, even when that value is too large for 'base'; it is left
// untouched for any other character.
// Returns true only if 'ch' is a digit/letter whose value is below 'base'.
//
// The classification is done with explicit ASCII ranges rather than
// isalnum()/tolower(): those are locale dependent, and in a single-byte
// non-"C" locale a high-bit letter would produce a negative digit value
// that passes the 'delta < base' check.
bool s_IsGoodCharForRadix(char ch, int base, int* value = 0)
{
    // Corresponding numeric value of 'ch'
    int delta;
    if (ch >= '0'  &&  ch <= '9') {
        delta = ch - '0';
    } else if (ch >= 'a'  &&  ch <= 'z') {
        delta = ch - 'a' + 10;
    } else if (ch >= 'A'  &&  ch <= 'Z') {
        delta = ch - 'A' + 10;
    } else {
        return false;
    }
    if ( value ) {
        *value = delta;
    }
    return delta < base;
}
00466 
00467 
// Skip modes for s_SkipAllowedSymbols(), which advances a position in the
// string past the characters permitted by the selected mode.
enum ESkipMode {
    eSkipAll,           // all symbols (jump straight to the end of string)
    eSkipAllAllowed,    // all symbols, except digit/+/-/.
    eSkipSpacesOnly     // spaces only (still stops at digit/+/-/.)
};
00475 
00476 void s_SkipAllowedSymbols(const CTempString& str,
00477                           SIZE_TYPE&         pos,
00478                           ESkipMode          skip_mode)
00479 {
00480     if (skip_mode == eSkipAll) {
00481         pos = str.length();
00482         return;
00483     }
00484     for ( SIZE_TYPE len = str.length(); pos < len; ++pos ) {
00485         unsigned char ch = str[pos];
00486         if ( isdigit(ch)  ||  ch == '+' ||  ch == '-'  ||  ch == '.' ) {
00487             break;
00488         }
00489         if ( (skip_mode == eSkipSpacesOnly)  &&  !isspace(ch) ) {
00490             break;
00491         }
00492     }
00493 }
00494 
00495 
00496 // Check radix base. If it is zero, determine base using first chars
00497 // of the string. Update 'base' value.
00498 // Update 'ptr' to current position in the string.
00499 
00500 bool s_CheckRadix(const CTempString& str, SIZE_TYPE& pos, int& base)
00501 {
00502     // Check base
00503     if ( base < 0  ||  base == 1  ||  base > 36 ) {
00504         return false;
00505     }
00506     // Try to determine base using first chars of the string
00507     unsigned char ch   = str[pos];
00508     unsigned char next = str[pos+1];
00509     if ( base == 0 ) {
00510         if ( ch != '0' ) {
00511             base = 10;
00512         } else if (next == 'x' || next == 'X') {
00513             base = 16;
00514         } else {
00515             base = 8;
00516         }
00517     }
00518     // Remove leading '0x' for hex numbers
00519     if ( base == 16 ) {
00520         if (ch == '0'  &&  (next == 'x' || next == 'X')) {
00521             pos += 2;
00522         }
00523     }
00524     return true;
00525 }
00526 
00527 
00528 Int8 NStr::StringToInt8(const CTempString& str, TStringToNumFlags flags,
00529                         int base)
00530 {
00531     _ASSERT(flags == 0  ||  flags > 32);
00532 
00533     // Current position in the string
00534     SIZE_TYPE pos = 0;
00535 
00536     // Skip allowed leading symbols
00537     if (flags & fAllowLeadingSymbols) {
00538         bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
00539         s_SkipAllowedSymbols(str, pos,
00540                              spaces ? eSkipSpacesOnly : eSkipAllAllowed);
00541     }
00542     // Determine sign
00543     bool sign = false;
00544     switch (str[pos]) {
00545     case '-':
00546         sign = true;
00547         /*FALLTHRU*/
00548     case '+':
00549         pos++;
00550         break;
00551     default:
00552         if (flags & fMandatorySign) {
00553             S2N_CONVERT_ERROR_INVAL(Int8);
00554         }
00555         break;
00556     }
00557     // Check radix base
00558     if ( !s_CheckRadix(str, pos, base) ) {
00559         S2N_CONVERT_ERROR_RADIX(Int8, "bad numeric base '" + 
00560                                 NStr::IntToString(base)+ "'");
00561     }
00562 
00563     // Begin conversion
00564     Int8 n = 0;
00565     Int8 limdiv = kMax_I8 / base;
00566     Int8 limoff = kMax_I8 % base + (sign ? 1 : 0);
00567 
00568     // Number of symbols between two commas. '-1' means -- no comma yet.
00569     int       comma  = -1;  
00570     SIZE_TYPE numpos = pos;
00571 
00572     errno = 0;
00573     while (str[pos]) {
00574         char ch = str[pos];
00575         int  delta;   // corresponding numeric value of 'ch'
00576 
00577         // Check on possible commas
00578         CHECK_COMMAS;
00579         // Sanity check
00580         if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
00581             break;
00582         }
00583         // Overflow check
00584         if ( n > limdiv  ||  (n == limdiv  &&  delta > limoff) ) {
00585             S2N_CONVERT_ERROR_OVERFLOW(Int8);
00586         }
00587         n *= base;
00588         n += delta;
00589         pos++;
00590     }
00591 
00592     // Last checks
00593     if ( !pos  || ((comma >= 0)  &&  (comma != 3)) ) {
00594         S2N_CONVERT_ERROR_INVAL(Int8);
00595     }
00596     // Skip allowed trailing symbols
00597     if (flags & fAllowTrailingSymbols) {
00598         bool spaces = ((flags & fAllowTrailingSymbols) ==
00599                        fAllowTrailingSpaces);
00600         s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll);
00601     }
00602     // Assign sign before the end pointer check
00603     n = sign ? -n : n;
00604     CHECK_ENDPTR(Int8);
00605     return n;
00606 }
00607 
00608 
00609 Uint8 NStr::StringToUInt8(const CTempString& str,
00610                           TStringToNumFlags flags, int base)
00611 {
00612     _ASSERT(flags == 0  ||  flags > 32);
00613 
00614     // Current position in the string
00615     SIZE_TYPE pos = 0;
00616 
00617     // Skip allowed leading symbols
00618     if (flags & fAllowLeadingSymbols) {
00619         bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
00620         s_SkipAllowedSymbols(str, pos,
00621                              spaces ? eSkipSpacesOnly : eSkipAllAllowed);
00622     }
00623     // Determine sign
00624     if (str[pos] == '+') {
00625         pos++;
00626     } else {
00627         if (flags & fMandatorySign) {
00628             S2N_CONVERT_ERROR_INVAL(Uint8);
00629         }
00630     }
00631     // Check radix base
00632     if ( !s_CheckRadix(str, pos, base) ) {
00633         S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
00634                                 NStr::IntToString(base) + "'");
00635     }
00636 
00637     // Begin conversion
00638     Uint8 n = 0;
00639     Uint8 limdiv = kMax_UI8 / base;
00640     int   limoff = int(kMax_UI8 % base);
00641 
00642     // Number of symbols between two commas. '-1' means -- no comma yet.
00643     int       comma  = -1;  
00644     SIZE_TYPE numpos = pos;
00645 
00646     errno = 0;
00647     while (str[pos]) {
00648         char ch = str[pos];
00649         int  delta;         // corresponding numeric value of 'ch'
00650 
00651         // Check on possible commas
00652         CHECK_COMMAS;
00653         // Sanity check
00654         if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
00655             break;
00656         }
00657         // Overflow check
00658         if (n > limdiv  ||  (n == limdiv  &&  delta > limoff)) {
00659             S2N_CONVERT_ERROR_OVERFLOW(Uint8);
00660         }
00661         n *= base;
00662         n += delta;
00663         pos++;
00664     }
00665 
00666     // Last checks
00667     if ( !pos  || ((comma >= 0)  &&  (comma != 3)) ) {
00668         S2N_CONVERT_ERROR_INVAL(Uint8);
00669     }
00670     // Skip allowed trailing symbols
00671     if (flags & fAllowTrailingSymbols) {
00672         bool spaces = ((flags & fAllowTrailingSymbols) ==
00673                        fAllowTrailingSpaces);
00674         s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll);
00675     }
00676     CHECK_ENDPTR(Uint8);
00677     return n;
00678 }
00679 
00680 
00681 double NStr::StringToDoubleEx(const char* str, size_t size,
00682                               TStringToNumFlags flags)
00683 {
00684     _ASSERT(flags == 0  ||  flags > 32);
00685     _ASSERT(str[size] == '\0');
00686 
00687     // Current position in the string
00688     SIZE_TYPE pos  = 0;
00689 
00690     // Skip allowed leading symbols
00691     if (flags & fAllowLeadingSymbols) {
00692         bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
00693         s_SkipAllowedSymbols(CTempString(str, size), pos,
00694                              spaces ? eSkipSpacesOnly : eSkipAllAllowed);
00695     }
00696     // Check mandatory sign
00697     if (flags & fMandatorySign) {
00698         switch (str[pos]) {
00699         case '-':
00700         case '+':
00701             break;
00702         default:
00703             S2N_CONVERT_ERROR_INVAL(double);
00704         }
00705     }
00706     // For consistency make additional check on incorrect leading symbols.
00707     // Because strtod() may just skip such symbols.
00708     if (!(flags & fAllowLeadingSymbols)) {
00709         char c = str[pos];
00710         if ( !isdigit((unsigned int)c)  &&  c != '.'  &&  c != '-'  &&  c != '+') {
00711             S2N_CONVERT_ERROR_INVAL(double);
00712         }
00713     }
00714 
00715     // Conversion
00716     char* endptr = 0;
00717     const char* begptr = str + pos;
00718 
00719     errno = 0;
00720     double n = strtod(begptr, &endptr);
00721     if ( errno  ||  !endptr  ||  endptr == begptr ) {
00722         S2N_CONVERT_ERROR(double, kEmptyStr, EINVAL, false,
00723                           s_DiffPtr(endptr, begptr) + pos);
00724     }
00725     if ( *(endptr - 1) != '.'  &&  *endptr == '.' ) {
00726         // Only a single dot at the end of line is allowed
00727         if (endptr == strchr(begptr, '.')) {
00728             endptr++;
00729         }
00730     }
00731     pos += s_DiffPtr(endptr, begptr);
00732 
00733     // Skip allowed trailing symbols
00734     if (flags & fAllowTrailingSymbols) {
00735         bool spaces = ((flags & fAllowTrailingSymbols) ==
00736                        fAllowTrailingSpaces);
00737         s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll);
00738     }
00739     CHECK_ENDPTR(double);
00740     return n;
00741 }
00742 
00743 
00744 double NStr::StringToDouble(const CTempStringEx& str, TStringToNumFlags flags)
00745 {
00746     size_t size = str.size();
00747     if ( str.HasZeroAtEnd() ) {
00748         // string has zer at the end already
00749         return StringToDoubleEx(str.data(), size, flags);
00750     }
00751     char buf[256]; // small temporary buffer in stack for appending zero char
00752     if ( size < sizeof(buf) ) {
00753         memcpy(buf, str.data(), size);
00754         buf[size] = '\0';
00755         return StringToDoubleEx(buf, size, flags);
00756     }
00757     else {
00758         // use std::string() to allocate memory for appending zero char
00759         return StringToDoubleEx(string(str).c_str(), size, flags);
00760     }
00761 }
00762 
00763 /// @internal
00764 static Uint8 s_DataSizeConvertQual(const CTempString&      str,
00765                                    SIZE_TYPE&              pos, 
00766                                    Uint8                   value,
00767                                    NStr::TStringToNumFlags flags)
00768 {
00769     unsigned char ch = str[pos];
00770     if ( !ch ) {
00771         return value;
00772     }
00773 
00774     ch = toupper(ch);
00775     Uint8 v   = value;
00776     bool  err = false;
00777 
00778     switch(ch) {
00779     case 'K':
00780         pos++;
00781         if ((kMax_UI8 / 1024) < v) {
00782             err = true;
00783         }
00784         v *= 1024;
00785         break;
00786     case 'M':
00787         pos++;
00788         if ((kMax_UI8 / 1024 / 1024) < v) {
00789             err = true;
00790         }
00791         v *= 1024 * 1024;
00792         break;
00793     case 'G':
00794         pos++;
00795         if ((kMax_UI8 / 1024 / 1024 / 1024) < v) {
00796             err = true;
00797         }
00798         v *= 1024 * 1024 * 1024;
00799         break;
00800     default:
00801         // error -- the "qual" points to the last unprocessed symbol
00802         S2N_CONVERT_ERROR_INVAL(Uint8);
00803     }
00804     if ( err ) {
00805         S2N_CONVERT_ERROR_OVERFLOW(DataSize);
00806     }
00807 
00808     ch = str[pos];
00809     if ( ch  &&  toupper(ch) == 'B' ) {
00810         pos++;
00811     }
00812     return v;
00813 }
00814 
00815 
00816 Uint8 NStr::StringToUInt8_DataSize(const CTempString& str, 
00817                                    TStringToNumFlags  flags, 
00818                                    int                base)
00819 {
00820     // We have a limited base range here
00821     _ASSERT(flags == 0  ||  flags > 20);
00822 
00823     // Current position in the string
00824     SIZE_TYPE pos = 0;
00825 
00826     // Find end of number representation
00827     {{
00828         // Skip allowed leading symbols
00829         if (flags & fAllowLeadingSymbols) {
00830             bool spaces = ((flags & fAllowLeadingSymbols) ==
00831                            fAllowLeadingSpaces);
00832             s_SkipAllowedSymbols(str, pos,
00833                            spaces ? eSkipSpacesOnly : eSkipAllAllowed);
00834         }
00835         // Determine sign
00836         if (str[pos] == '+') {
00837             pos++;
00838             // strip fMandatorySign flag
00839             flags &= ~fMandatorySign;
00840         } else {
00841             if (flags & fMandatorySign) {
00842                 S2N_CONVERT_ERROR_INVAL(Uint8);
00843             }
00844         }
00845         // Check radix base
00846         if ( !s_CheckRadix(str, pos, base) ) {
00847             S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
00848                                     NStr::IntToString(base) + "'");
00849         }
00850     }}
00851 
00852     SIZE_TYPE numpos = pos;
00853     char ch = str[pos];
00854     while (ch) {
00855         if ( !s_IsGoodCharForRadix(ch, base)  &&
00856              ((ch != ',')  ||  !(flags & fAllowCommas)) ) {
00857             break;
00858         }
00859         ch = str[++pos];
00860     }
00861     // If string is empty, just use whole remaining string for conversion
00862     // (for correct error reporting)
00863     if (pos-numpos == 0) {
00864         pos = str.length();
00865     }
00866 
00867     // Convert to number
00868     Uint8 n = StringToUInt8(CTempString(str.data()+numpos, pos-numpos),
00869                             flags, base);
00870     if ( errno ) {
00871         // If exceptions enabled by flags that it has been already thrown.
00872         // errno is also set, so return a zero.
00873         return 0;
00874     }
00875     // Check trailer (KB, MB, ...)
00876     if ( ch ) {
00877         n = s_DataSizeConvertQual(str, pos, n, flags);
00878     }
00879     // Skip allowed trailing symbols
00880     if (flags & fAllowTrailingSymbols) {
00881         bool spaces = ((flags & fAllowTrailingSymbols) ==
00882                        fAllowTrailingSpaces);
00883         s_SkipAllowedSymbols(str, pos, spaces ? eSkipSpacesOnly : eSkipAll);
00884     }
00885     CHECK_ENDPTR(Uint8);
00886     return n;
00887 }
00888 
00889 
00890 void NStr::IntToString(string& out_str, long svalue,
00891                        TNumToStringFlags flags, int base)
00892 {
00893     _ASSERT(flags == 0  ||  flags > 32);
00894     if ( base < 2  ||  base > 36 ) {
00895         return;
00896     }
00897 
00898     unsigned long value = static_cast<unsigned long>(svalue);
00899     
00900     const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
00901     char  buffer[kBufSize];
00902     char* pos = buffer + kBufSize;
00903     
00904     if ( base == 10 ) {
00905         if ( svalue < 0 ) {
00906             value = static_cast<unsigned long>(-svalue);
00907         }
00908         
00909         if ( (flags & fWithCommas) ) {
00910             int cnt = -1;
00911             do {
00912                 if (++cnt == 3) {
00913                     *--pos = ',';
00914                     cnt = 0;
00915                 }
00916                 unsigned long a = '0'+value;
00917                 value /= 10;
00918                 *--pos = char(a - value*10);
00919             } while ( value );
00920         }
00921         else {
00922             do {
00923                 unsigned long a = '0'+value;
00924                 value /= 10;
00925                 *--pos = char(a - value*10);
00926             } while ( value );
00927         }
00928 
00929         if (svalue < 0)
00930             *--pos = '-';
00931         else if (flags & fWithSign)
00932             *--pos = '+';
00933     }
00934     else if ( base == 16 ) {
00935         do {
00936             *--pos = s_Hex[value % 16];
00937             value /= 16;
00938         } while ( value );
00939     }
00940     else {
00941         do {
00942             *--pos = s_Hex[value % base];
00943             value /= base;
00944         } while ( value );
00945     }
00946 
00947     out_str.assign(pos, buffer + kBufSize - pos);
00948 }
00949 
00950 
00951 void NStr::UIntToString(string&           out_str,
00952                         unsigned long     value,
00953                         TNumToStringFlags flags,
00954                         int               base)
00955 {
00956     _ASSERT(flags == 0  ||  flags > 32);
00957     if ( base < 2  ||  base > 36 ) {
00958         return;
00959     }
00960 
00961     const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
00962     char  buffer[kBufSize];
00963     char* pos = buffer + kBufSize;
00964 
00965     if ( base == 10 ) {
00966         if ( (flags & fWithCommas) ) {
00967             int cnt = -1;
00968             do {
00969                 if (++cnt == 3) {
00970                     *--pos = ',';
00971                     cnt = 0;
00972                 }
00973                 unsigned long a = '0'+value;
00974                 value /= 10;
00975                 *--pos = char(a - value*10);
00976             } while ( value );
00977         }
00978         else {
00979             do {
00980                 unsigned long a = '0'+value;
00981                 value /= 10;
00982                 *--pos = char(a - value*10);
00983             } while ( value );
00984         }
00985 
00986         if ( (flags & fWithSign) ) {
00987             *--pos = '+';
00988         }
00989     }
00990     else if ( base == 16 ) {
00991         do {
00992             *--pos = s_Hex[value % 16];
00993             value /= 16;
00994         } while ( value );
00995     }
00996     else {
00997         do {
00998             *--pos = s_Hex[value % base];
00999             value /= base;
01000         } while ( value );
01001     }
01002 
01003     out_str.assign(pos, buffer + kBufSize - pos);
01004 }
01005 
01006 
01007 string NStr::Int8ToString(Int8 value, TNumToStringFlags flags, int base)
01008 {
01009     string ret;
01010     NStr::Int8ToString(ret, value, flags, base);
01011     return ret;
01012 }
01013 
01014 
01015 // On some platforms division of Int8 is very slow,
01016 // so will try to optimize it working with chunks.
01017 // Works only for radix base == 10.
01018 
01019 #define PRINT_INT8_CHUNK 1000000000
01020 #define PRINT_INT8_CHUNK_SIZE 9
01021 
01022 /// @internal
01023 static char* s_PrintUint8(char*                   pos,
01024                           Uint8                   value,
01025                           NStr::TNumToStringFlags flags,
01026                           int                     base)
01027 {
01028     if ( base == 10 ) {
01029         if ( (flags & NStr::fWithCommas) ) {
01030             int cnt = -1;
01031 #ifdef PRINT_INT8_CHUNK
01032             // while n doesn't fit in Uint4 process the number
01033             // by 9-digit chunks within 32-bit Uint4
01034             while ( value & ~Uint8(Uint4(~0)) ) {
01035                 Uint4 chunk = Uint4(value);
01036                 value /= PRINT_INT8_CHUNK;
01037                 chunk -= PRINT_INT8_CHUNK*Uint4(value);
01038                 char* end = pos - PRINT_INT8_CHUNK_SIZE - 2; // 9-digit chunk should have 2 commas
01039                 do {
01040                     if (++cnt == 3) {
01041                         *--pos = ',';
01042                         cnt = 0;
01043                     }
01044                     Uint4 a = '0'+chunk;
01045                     chunk /= 10;
01046                     *--pos = char(a-10*chunk);
01047                 } while ( pos != end );
01048             }
01049             // process all remaining digits in 32-bit number
01050             Uint4 chunk = Uint4(value);
01051             do {
01052                 if (++cnt == 3) {
01053                     *--pos = ',';
01054                     cnt = 0;
01055                 }
01056                 Uint4 a = '0'+chunk;
01057                 chunk /= 10;
01058                 *--pos = char(a-10*chunk);
01059             } while ( chunk );
01060 #else
01061             do {
01062                 if (++cnt == 3) {
01063                     *--pos = ',';
01064                     cnt = 0;
01065                 }
01066                 Uint8 a = '0'+value;
01067                 value /= 10;
01068                 *--pos = char(a - 10*value);
01069             } while ( value );
01070 #endif
01071         }
01072         else {
01073 #ifdef PRINT_INT8_CHUNK
01074             // while n doesn't fit in Uint4 process the number
01075             // by 9-digit chunks within 32-bit Uint4
01076             while ( value & ~Uint8(Uint4(~0)) ) {
01077                 Uint4 chunk = Uint4(value);
01078                 value /= PRINT_INT8_CHUNK;
01079                 chunk -= PRINT_INT8_CHUNK*Uint4(value);
01080                 char* end = pos - PRINT_INT8_CHUNK_SIZE;
01081                 do {
01082                     Uint4 a = '0'+chunk;
01083                     chunk /= 10;
01084                     *--pos = char(a-10*chunk);
01085                 } while ( pos != end );
01086             }
01087             // process all remaining digits in 32-bit number
01088             Uint4 chunk = Uint4(value);
01089             do {
01090                 Uint4 a = '0'+chunk;
01091                 chunk /= 10;
01092                 *--pos = char(a-10*chunk);
01093             } while ( chunk );
01094 #else
01095             do {
01096                 Uint8 a = '0'+value;
01097                 value /= 10;
01098                 *--pos = char(a-10*value);
01099             } while ( value );
01100 #endif
01101         }
01102     }
01103     else if ( base == 16 ) {
01104         do {
01105             *--pos = s_Hex[value % 16];
01106             value /= 16;
01107         } while ( value );
01108     }
01109     else {
01110         do {
01111             *--pos = s_Hex[value % base];
01112             value /= base;
01113         } while ( value );
01114     }
01115     return pos;
01116 }
01117 
01118 
01119 void NStr::Int8ToString(string& out_str, Int8 svalue,
01120                         TNumToStringFlags flags, int base)
01121 {
01122     _ASSERT(flags == 0  ||  flags > 32);
01123     if ( base < 2  ||  base > 36 ) {
01124         return;
01125     }
01126 
01127     Uint8 value;
01128     if (base == 10) {
01129         value = static_cast<Uint8>(svalue<0?-svalue:svalue);
01130     } else {
01131         value = static_cast<Uint8>(svalue);
01132     }
01133 
01134     const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
01135     char  buffer[kBufSize];
01136 
01137     char* pos = s_PrintUint8(buffer + kBufSize, value, flags, base);
01138 
01139     if (base == 10) {
01140         if (svalue < 0)
01141             *--pos = '-';
01142         else if (flags & fWithSign)
01143             *--pos = '+';
01144     }
01145     out_str.assign(pos, buffer + kBufSize - pos);
01146 }
01147 
01148 
01149 string NStr::UInt8ToString(Uint8 value, TNumToStringFlags flags, int base)
01150 {
01151     string ret;
01152     NStr::UInt8ToString(ret, value, flags, base);
01153     return ret;
01154 }
01155 
01156 
01157 void NStr::UInt8ToString(string& out_str, Uint8 value,
01158                          TNumToStringFlags flags, int base)
01159 {
01160     _ASSERT(flags == 0  ||  flags > 32);
01161     if ( base < 2  ||  base > 36 ) {
01162         return;
01163     }
01164 
01165     const SIZE_TYPE kBufSize = CHAR_BIT  * sizeof(value);
01166     char  buffer[kBufSize];
01167 
01168     char* pos = s_PrintUint8(buffer + kBufSize, value, flags, base);
01169 
01170     if ( (base == 10)  &&  (flags & fWithSign) ) {
01171         *--pos = '+';
01172     }
01173     out_str.assign(pos, buffer + kBufSize - pos);
01174 }
01175 
01176 
// A maximal double precision used in the double to string conversion
#if defined(NCBI_OS_MSWIN)
    const int kMaxDoublePrecision = 200;
#else
    const int kMaxDoublePrecision = 308;
#endif
// A maximal size of a double value in a string form:
// integer digits + sign + dot + ending '\0' + max.precision.
// The largest finite double (about 1.8e308) has 309 integer digits in fixed
// notation, so 309 (not 308) must be reserved for the integer part.
const int kMaxDoubleStringSize = 309 + 3 + kMaxDoublePrecision;
01186 
01187 
01188 string NStr::DoubleToString(double value, int precision,
01189                             TNumToStringFlags flags)
01190 {
01191     string str;
01192     DoubleToString(str, value, precision, flags);
01193     return str;
01194 }
01195 
01196 
01197 void NStr::DoubleToString(string& out_str, double value,
01198                           int precision, TNumToStringFlags flags)
01199 {
01200     char buffer[kMaxDoubleStringSize];
01201     if (precision >= 0) {
01202         SIZE_TYPE n = DoubleToString(value, precision, buffer,
01203                                      kMaxDoubleStringSize, flags);
01204         buffer[n] = '\0';
01205     } else {
01206         const char* format;
01207         switch (flags & fDoubleGeneral) {
01208             case fDoubleFixed:
01209                 format = "%f";
01210                 break;
01211             case fDoubleScientific:
01212                 format = "%e";
01213                 break;
01214             case fDoubleGeneral: // default
01215             default: 
01216                 format = "%g";
01217                 break;
01218         }
01219         ::sprintf(buffer, format, value);
01220     }
01221     out_str = buffer;
01222 }
01223 
01224 
01225 
01226 SIZE_TYPE NStr::DoubleToString(double value, unsigned int precision,
01227                                char* buf, SIZE_TYPE buf_size,
01228                                TNumToStringFlags flags)
01229 {
01230     char buffer[kMaxDoubleStringSize];
01231     if (precision > (unsigned int)kMaxDoublePrecision) {
01232         precision = (unsigned int)kMaxDoublePrecision;
01233     }
01234     const char* format;
01235     switch (flags & fDoubleGeneral) {
01236         case fDoubleScientific:
01237             format = "%.*e";
01238             break;
01239         case fDoubleGeneral:
01240             format = "%.*g";
01241             break;
01242         case fDoubleFixed: // default
01243         default:
01244             format = "%.*f";
01245             break;
01246     }
01247     int n = ::sprintf(buffer, format, (int)precision, value);
01248     SIZE_TYPE n_copy = min((SIZE_TYPE) n, buf_size);
01249     memcpy(buf, buffer, n_copy);
01250     return n_copy;
01251 }
01252 
01253 
01254 string NStr::PtrToString(const void* value)
01255 {
01256     char buffer[64];
01257     ::sprintf(buffer, "%p", value);
01258     return buffer;
01259 }
01260 
01261 
01262 void NStr::PtrToString(string& out_str, const void* value)
01263 {
01264     char buffer[64];
01265     ::sprintf(buffer, "%p", value);
01266     out_str = buffer;
01267 }
01268 
01269 
01270 const void* NStr::StringToPtr(const string& str)
01271 {
01272     void *ptr = NULL;
01273     ::sscanf(str.c_str(), "%p", &ptr);
01274     return ptr;
01275 }
01276 
01277 
// Spellings used for boolean <-> string conversion. Both the characters
// and the pointers themselves are constant, so none of them can be
// reseated by accident.
static const char* const s_kTrueString  = "true";
static const char* const s_kFalseString = "false";
static const char* const s_kTString     = "t";
static const char* const s_kFString     = "f";
static const char* const s_kYesString   = "yes";
static const char* const s_kNoString    = "no";
static const char* const s_kYString     = "y";
static const char* const s_kNString     = "n";
01286 
01287 
01288 const string NStr::BoolToString(bool value)
01289 {
01290     return value ? s_kTrueString : s_kFalseString;
01291 }
01292 
01293 
01294 bool NStr::StringToBool(const string& str)
01295 {
01296     if ( AStrEquiv(str, s_kTrueString,  PNocase())  ||
01297          AStrEquiv(str, s_kTString,     PNocase())  ||
01298          AStrEquiv(str, s_kYesString,   PNocase())  ||
01299          AStrEquiv(str, s_kYString,     PNocase()) )
01300         return true;
01301 
01302     if ( AStrEquiv(str, s_kFalseString, PNocase())  ||
01303          AStrEquiv(str, s_kFString,     PNocase())  ||
01304          AStrEquiv(str, s_kNoString,    PNocase())  ||
01305          AStrEquiv(str, s_kNString,     PNocase()) )
01306         return false;
01307 
01308     NCBI_THROW2(CStringException, eConvert,
01309                 "String cannot be converted to bool", 0);
01310 }
01311 
01312 
01313 string NStr::FormatVarargs(const char* format, va_list args)
01314 {
01315 #ifdef HAVE_VASPRINTF
01316     char* s;
01317     int n = vasprintf(&s, format, args);
01318     if (n >= 0) {
01319         string str(s, n);
01320         free(s);
01321         return str;
01322     } else {
01323         return kEmptyStr;
01324     }
01325 
01326 #elif defined(NCBI_COMPILER_GCC) && defined(NO_PUBSYNC)
01327     CNcbiOstrstream oss;
01328     oss.vform(format, args);
01329     return CNcbiOstrstreamToString(oss);
01330 
01331 #elif defined(HAVE_VSNPRINTF)
01332     // deal with implementation quirks
01333     SIZE_TYPE size = 1024;
01334     AutoPtr<char, ArrayDeleter<char> > buf(new char[size]);
01335     buf.get()[size-1] = buf.get()[size-2] = 0;
01336     SIZE_TYPE n = vsnprintf(buf.get(), size, format, args);
01337     while (n >= size  ||  buf.get()[size-2]) {
01338         if (buf.get()[size-1]) {
01339             ERR_POST_X(1, Warning << "Buffer overrun by buggy vsnprintf");
01340         }
01341         size = max(size << 1, n);
01342         buf.reset(new char[size]);
01343         buf.get()[size-1] = buf.get()[size-2] = 0;
01344         n = vsnprintf(buf.get(), size, format, args);
01345     }
01346     return (n > 0) ? string(buf.get(), n) : kEmptyStr;
01347 
01348 #elif defined(HAVE_VPRINTF)
01349     char buf[1024];
01350     buf[sizeof(buf) - 1] = 0;
01351     vsprintf(buf, format, args);
01352     if (buf[sizeof(buf) - 1]) {
01353         ERR_POST_X(2, Warning << "Buffer overrun by vsprintf");
01354     }
01355     return buf;
01356 
01357 #else
01358 #  error Please port this code to your system.
01359 #endif
01360 }
01361 
01362 
01363 SIZE_TYPE NStr::FindNoCase(const string& str, const string& pattern,
01364                            SIZE_TYPE start, SIZE_TYPE end, EOccurrence where)
01365 {
01366     string    pat(pattern, 0, 1);
01367     SIZE_TYPE l = pattern.size();
01368     if (isupper((unsigned char) pat[0])) {
01369         pat += (char) tolower((unsigned char) pat[0]);
01370     } else if (islower((unsigned char) pat[0])) {
01371         pat += (char) toupper((unsigned char) pat[0]);
01372     }
01373     if (where == eFirst) {
01374         SIZE_TYPE pos = str.find_first_of(pat, start);
01375         while (pos != NPOS  &&  pos <= end
01376                &&  CompareNocase(str, pos, l, pattern) != 0) {
01377             pos = str.find_first_of(pat, pos + 1);
01378         }
01379         return pos > end ? NPOS : pos;
01380     } else { // eLast
01381         SIZE_TYPE pos = str.find_last_of(pat, end);
01382         while (pos != NPOS  &&  pos >= start
01383                &&  CompareNocase(str, pos, l, pattern) != 0) {
01384             if (pos == 0) {
01385                 return NPOS;
01386             }
01387             pos = str.find_last_of(pat, pos - 1);
01388         }
01389         return pos < start ? NPOS : pos;
01390     }
01391 }
01392 
01393 
01394 const string* NStr::Find(const list <string>& lst, const string& val,
01395                          ECase use_case)
01396 {
01397    if (lst.empty()) return NULL;
01398 
01399    ITERATE (list<string>, st_itr, lst) {
01400        if (Equal(*st_itr, val, use_case)) {
01401            return &*st_itr;
01402        }
01403    }
01404 
01405    return NULL;
01406 }
01407 
01408 const string* NStr::Find(const vector <string>& vec, const string& val,
01409                          ECase use_case)
01410 {
01411    if (vec.empty()) return NULL;
01412 
01413    ITERATE (vector<string>, st_itr, vec) {
01414        if (Equal(*st_itr, val, use_case)) {
01415            return &*st_itr;
01416        }
01417    }
01418 
01419    return NULL;
01420 }
01421 
01422 
01423 template <class TStr>
01424 TStr s_TruncateSpaces(const TStr& str, NStr::ETrunc where,
01425                       const TStr& empty_str)
01426 {
01427     SIZE_TYPE length = str.length();
01428     if (length == 0) {
01429         return empty_str;
01430     }
01431     SIZE_TYPE beg = 0;
01432     if (where == NStr::eTrunc_Begin  ||  where == NStr::eTrunc_Both) {
01433         _ASSERT(beg < length);
01434         while ( isspace((unsigned char) str[beg]) ) {
01435             if (++beg == length) {
01436                 return empty_str;
01437             }
01438         }
01439     }
01440     SIZE_TYPE end = length;
01441     if ( where == NStr::eTrunc_End  ||  where == NStr::eTrunc_Both ) {
01442         _ASSERT(end > beg);
01443         for (--end;  isspace((unsigned char)str[end]);  --end) {
01444             if (end == beg) {
01445                 return empty_str;
01446             }
01447         }
01448         _ASSERT(end >= beg && !isspace((unsigned char) str[end]));
01449         ++end;
01450     }
01451     _ASSERT(beg <= end);
01452     if (beg == end) {
01453         return empty_str;
01454     }
01455     else if ( beg  ||  (end - length) ) {
01456         // if either beg != 0 or end != length
01457         return str.substr(beg, end - beg);
01458     }
01459     else {
01460         return str;
01461     }
01462 }
01463 
01464 
01465 string NStr::TruncateSpaces(const string& str, ETrunc where)
01466 {
01467     return s_TruncateSpaces(str, where, kEmptyStr);
01468 }
01469 
01470 CTempString NStr::TruncateSpaces(const CTempString& str, ETrunc where)
01471 {
01472     return s_TruncateSpaces(str, where, CTempString());
01473 }
01474 
01475 CTempString NStr::TruncateSpaces(const char* str, ETrunc where)
01476 {
01477     return s_TruncateSpaces(CTempString(str), where, CTempString());
01478 }
01479 
01480 
01481 void NStr::TruncateSpacesInPlace(string& str, ETrunc where)
01482 {
01483     SIZE_TYPE length = str.length();
01484     if (length == 0) {
01485         return;
01486     }
01487     SIZE_TYPE beg = 0;
01488     if ( where == eTrunc_Begin  ||  where == eTrunc_Both ) {
01489         // It's better to use str.data()[] to check string characters
01490         // to avoid implicit modification of the string by non-const operator[]
01491         _ASSERT(beg < length);
01492         while ( isspace((unsigned char) str.data()[beg]) ) {
01493             if (++beg == length) {
01494                 str.erase();
01495                 return;
01496             }
01497         }
01498     }
01499 
01500     SIZE_TYPE end = length;
01501     if ( where == eTrunc_End  ||  where == eTrunc_Both ) {
01502         // It's better to use str.data()[] to check string characters
01503         // to avoid implicit modification of the string by non-const operator[]
01504         _ASSERT(end > beg);
01505         while (isspace((unsigned char) str.data()[--end])) {
01506             if (end == beg) {
01507                 str.erase();
01508                 return;
01509             }
01510         }
01511         _ASSERT(end >= beg  &&  !isspace((unsigned char) str.data()[end]));
01512         ++end;
01513     }
01514     _ASSERT(beg < end);
01515 
01516 #if defined(NCBI_COMPILER_GCC)  &&  (NCBI_COMPILER_VERSION == 304)
01517     // work around a library bug
01518     str.replace(end, length, kEmptyStr);
01519     str.replace(0, beg, kEmptyStr);
01520 #else
01521     if ( (beg - 0) | (end - length) ) { // if either beg != 0 or end != length
01522         str.replace(0, length, str, beg, end - beg);
01523     }
01524 #endif
01525 }
01526 
01527 
01528 string& NStr::Replace(const string& src,
01529                       const string& search, const string& replace,
01530                       string& dst, SIZE_TYPE start_pos, SIZE_TYPE max_replace)
01531 {
01532     // source and destination should not be the same
01533     if (&src == &dst) {
01534         NCBI_THROW2(CStringException, eBadArgs,
01535                     "NStr::Replace():  source and destination are the same",0);
01536     }
01537 
01538     dst = src;
01539 
01540     if ( start_pos + search.size() > src.size() ||
01541          search == replace )
01542         return dst;
01543 
01544     for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
01545         start_pos = dst.find(search, start_pos);
01546         if (start_pos == NPOS)
01547             break;
01548         dst.replace(start_pos, search.size(), replace);
01549         start_pos += replace.size();
01550     }
01551     return dst;
01552 }
01553 
01554 
01555 string NStr::Replace(const string& src,
01556                      const string& search, const string& replace,
01557                      SIZE_TYPE start_pos, SIZE_TYPE max_replace)
01558 {
01559     string dst;
01560     Replace(src, search, replace, dst, start_pos, max_replace);
01561     return dst;
01562 }
01563 
01564 
01565 string& NStr::ReplaceInPlace(string& src,
01566                              const string& search, const string& replace,
01567                              SIZE_TYPE start_pos, SIZE_TYPE max_replace)
01568 {
01569     if ( start_pos + search.size() > src.size()  ||
01570          search == replace )
01571         return src;
01572 
01573     bool equal_len = (search.size() == replace.size());
01574     for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
01575         start_pos = src.find(search, start_pos);
01576         if (start_pos == NPOS)
01577             break;
01578         // On some platforms string's replace() implementation
01579         // is not optimal if size of search and replace strings are equal
01580         if ( equal_len ) {
01581             copy(replace.begin(), replace.end(), src.begin() + start_pos); 
01582         } else {
01583             src.replace(start_pos, search.size(), replace);
01584         }
01585         start_pos += replace.size();
01586     }
01587     return src;
01588 }
01589 
01590 
01591 list<string>& NStr::Split(const string& str, const string& delim,
01592                           list<string>& arr, EMergeDelims merge,
01593                           vector<SIZE_TYPE>* token_pos)
01594 {
01595 
01596     typedef list<string>                                    TContainer;
01597     typedef CStrTokenPosAdapter<vector<SIZE_TYPE> >         TPosArray;
01598     typedef CStrDummyTargetReserve<string, TContainer, 
01599             TPosArray, CStrDummyTokenCount<string > >       TReserve;
01600     typedef CStrTokenize<string, TContainer, 
01601                         TPosArray,
01602                         CStrDummyTokenCount<string>,
01603                         TReserve>                           TSplitter;
01604     TPosArray token_pos_proxy(token_pos);
01605     TSplitter::Do(str, delim, arr, 
01606                   (CStrTokenizeBase::EMergeDelims)merge, 
01607                   token_pos_proxy,
01608                   kEmptyStr);
01609     return arr;
01610 /*
01611     // Special cases
01612     if (str.empty()) {
01613         return arr;
01614     } else if (delim.empty()) {
01615         arr.push_back(str);
01616         if (token_pos)
01617             token_pos->push_back(0);
01618         return arr;
01619     }
01620 
01621     for (SIZE_TYPE pos = 0; ; ) {
01622         SIZE_TYPE prev_pos = (merge == eMergeDelims
01623                               ? str.find_first_not_of(delim, pos)
01624                               : pos);
01625         if (prev_pos == NPOS) {
01626             break;
01627         }
01628         pos = str.find_first_of(delim, prev_pos);
01629         if (pos == NPOS) {
01630             // Avoid using temporary objects
01631             // ~ arr.push_back(str.substr(prev_pos));
01632             arr.push_back(kEmptyStr);
01633             arr.back().assign(str, prev_pos, str.length() - prev_pos);
01634             if (token_pos)
01635                 token_pos->push_back(prev_pos);
01636             break;
01637         } else {
01638             // Avoid using temporary objects
01639             // ~ arr.push_back(str.substr(prev_pos, pos - prev_pos));
01640             arr.push_back(kEmptyStr);
01641             arr.back().assign(str, prev_pos, pos - prev_pos);
01642             if (token_pos)
01643                 token_pos->push_back(prev_pos);
01644             ++pos;
01645         }
01646     }
01647     return arr;
01648 */
01649 }
01650 
01651 
01652 vector<string>& NStr::Tokenize(const string& str, const string& delim,
01653                                vector<string>& arr, EMergeDelims merge,
01654                                vector<SIZE_TYPE>* token_pos)
01655 {
01656     typedef vector<string>                                  TContainer;
01657     typedef CStrTokenPosAdapter<vector<SIZE_TYPE> >         TPosArray;
01658     typedef CStrTargetReserve<string, TContainer, 
01659                               TPosArray, CStringTokenCount> TReserve;
01660     typedef CStrTokenize<string, TContainer, 
01661                         TPosArray,
01662                         CStringTokenCount,
01663                         TReserve>                            TSplitter;
01664     TPosArray token_pos_proxy(token_pos);
01665     TSplitter::Do(str, delim, arr, 
01666                   (CStrTokenizeBase::EMergeDelims)merge,
01667                   token_pos_proxy,
01668                   kEmptyStr);
01669     return arr;
01670 /*
01671     // Special cases
01672     if (str.empty()) {
01673         return arr;
01674     } else if (delim.empty()) {
01675         arr.push_back(str);
01676         if (token_pos)
01677             token_pos->push_back(0);
01678         return arr;
01679     }
01680 
01681     SIZE_TYPE pos, prev_pos;
01682 
01683     // Reserve vector size only for empty vectors.
01684     // For vectors which already have items this usualy works slower.
01685     if ( !arr.size() ) {
01686         // Count number of tokens to determine the array size
01687         size_t tokens = 0;
01688         
01689         for (pos = 0;;) {
01690             prev_pos = (merge == NStr::eMergeDelims ? 
01691                             str.find_first_not_of(delim, pos) : pos);
01692             if (prev_pos == NPOS) {
01693                 break;
01694             } 
01695             pos = str.find_first_of(delim, prev_pos);
01696             ++tokens;
01697             if (pos == NPOS) {
01698                 break;
01699             }
01700             ++pos;
01701         }
01702         arr.reserve(tokens);
01703         if (token_pos)
01704             token_pos->reserve(tokens);
01705 
01706     }
01707 
01708     // Tokenization
01709     for (pos = 0;;) {
01710         prev_pos = (merge == eMergeDelims ? 
01711                         str.find_first_not_of(delim, pos) : pos);
01712         if (prev_pos == NPOS) {
01713             break;
01714         }
01715         pos = str.find_first_of(delim, prev_pos);
01716         if (pos == NPOS) {
01717             // Avoid using temporary objects
01718             // ~ arr.push_back(str.substr(prev_pos));
01719             arr.push_back(kEmptyStr);
01720             arr.back().assign(str, prev_pos, str.length() - prev_pos);
01721             if (token_pos)
01722                 token_pos->push_back(prev_pos);
01723             break;
01724         } else {
01725             // Avoid using temporary objects
01726             // ~ arr.push_back(str.substr(prev_pos, pos - prev_pos));
01727             arr.push_back(kEmptyStr);
01728             arr.back().assign(str, prev_pos, pos - prev_pos);
01729             if (token_pos)
01730                 token_pos->push_back(prev_pos);
01731             ++pos;
01732         }
01733     }
01734     return arr;
01735 */
01736 }
01737 
01738 
01739 vector<string>& NStr::TokenizePattern(const string& str,
01740                                       const string& pattern,
01741                                       vector<string>& arr, EMergeDelims merge,
01742                                       vector<SIZE_TYPE>* token_pos)
01743 {
01744     // Special cases
01745     if (str.empty()) {
01746         return arr;
01747     } else if (pattern.empty()) {
01748         arr.push_back(str);
01749         if (token_pos)
01750             token_pos->push_back(0);
01751         return arr;
01752     }
01753 
01754     SIZE_TYPE pos, prev_pos;
01755 
01756     // Reserve vector size only for empty vectors.
01757     // For vectors which already have items this usualy works slower.
01758     if ( !arr.size() ) {
01759         // Count number of tokens to determine the array size
01760         size_t tokens = 0;
01761         for (pos = 0, prev_pos = 0; ; ) {
01762             pos = str.find(pattern, prev_pos);
01763             if ( merge != eMergeDelims  ||  pos > prev_pos ) {
01764                 if (pos == NPOS) {
01765                     if (merge != eMergeDelims  ||  
01766                         prev_pos < str.length() ) {
01767                         ++tokens;
01768                     }
01769                     break;
01770                 }
01771                 ++tokens;
01772             }
01773             prev_pos = pos + pattern.length();
01774         }
01775         arr.reserve(tokens);
01776         if (token_pos)
01777             token_pos->reserve(tokens);
01778     }
01779 
01780     // Tokenization
01781     for (pos = 0, prev_pos = 0; ; ) {
01782         pos = str.find(pattern, prev_pos);
01783         if ( merge != eMergeDelims  ||  pos > prev_pos ) {
01784             if (pos == NPOS) {
01785                 if (merge != eMergeDelims  ||  
01786                     prev_pos < str.length() ) {
01787                     // Avoid using temporary objects
01788                     // ~ arr.push_back(str.substr(prev_pos));
01789                     arr.push_back(kEmptyStr);
01790                     arr.back().assign(str, prev_pos,
01791                                       str.length() - prev_pos);
01792                     if (token_pos)
01793                         token_pos->push_back(prev_pos);
01794                 }
01795                 break;
01796             }
01797             // Avoid using temporary objects
01798             // ~ arr.push_back(str.substr(prev_pos, pos - prev_pos));
01799             arr.push_back(kEmptyStr);
01800             arr.back().assign(str, prev_pos, pos - prev_pos);
01801             if (token_pos)
01802                 token_pos->push_back(prev_pos);
01803         }
01804         prev_pos = pos + pattern.length();
01805     }
01806     return arr;
01807 }
01808 
01809 
01810 bool NStr::SplitInTwo(const string& str, const string& delim,
01811                       string& str1, string& str2)
01812 {
01813     SIZE_TYPE delim_pos = str.find_first_of(delim);
01814     if (NPOS == delim_pos) {   // only one piece.
01815         str1 = str;
01816         str2 = kEmptyStr;
01817         return false;
01818     }
01819     str1.assign(str, 0, delim_pos);
01820     // skip only one delimiter character.
01821     str2.assign(str, delim_pos + 1, str.length() - delim_pos - 1);
01822     
01823     return true;
01824 }
01825 
01826 
01827 template <typename T>
01828 string s_NStr_Join(const T& arr, const string& delim)
01829 {
01830     if (arr.empty()) {
01831         return kEmptyStr;
01832     }
01833 
01834     string result = arr.front();
01835     typename T::const_iterator it = arr.begin();
01836     SIZE_TYPE needed = result.size();
01837 
01838     while (++it != arr.end()) {
01839         needed += delim.size() + it->size();
01840     }
01841     result.reserve(needed);
01842     it = arr.begin();
01843     while (++it != arr.end()) {
01844         result += delim;
01845         result += *it;
01846     }
01847     return result;
01848 }
01849 
01850 
01851 string NStr::Join(const list<string>& arr, const string& delim)
01852 {
01853     return s_NStr_Join(arr, delim);
01854 }
01855 
01856 
01857 string NStr::Join(const vector<string>& arr, const string& delim)
01858 {
01859     return s_NStr_Join(arr, delim);
01860 }
01861 
01862 
// Target language whose quoting rules are applied when a string is made
// printable.
enum ELanguage {
    eLanguage_C          = 0,  // C-style escapes
    eLanguage_Javascript = 1   // as C, but '&' is escaped as well
};
01867 
01868 
01869 static inline bool s_IsQuoted(char c, ELanguage lang)
01870 {
01871     return (c == '\t'  ||   c == '\v'  ||  c == '\b'                      ||
01872             c == '\r'  ||   c == '\f'  ||  c == '\a'                      ||
01873             c == '\n'  ||   c == '\\'  ||  c == '\''                      ||
01874             c == '"'   ||  (c == '&'   &&  lang == eLanguage_Javascript)  ||
01875             !isprint((unsigned char) c) ? true : false);
01876 }
01877 
01878 
01879 static string s_PrintableString(const string&        str,
01880                                 NStr::TPrintableMode mode,
01881                                 ELanguage            lang)
01882 {
01883     auto_ptr<CNcbiOstrstream> out;
01884     SIZE_TYPE i, j = 0;
01885 
01886     for (i = 0;  i < str.size();  i++) {
01887         char c = str[i];
01888         switch (c) {
01889         case '\t':
01890             c = 't';
01891             break;
01892         case '\v':
01893             c = 'v';
01894             break;
01895         case '\b':
01896             c = 'b';
01897             break;
01898         case '\r':
01899             c = 'r';
01900             break;
01901         case '\f':
01902             c = 'f';
01903             break;
01904         case '\a':
01905             c = 'a';
01906             break;
01907         case '\n':
01908             if (!(mode & NStr::fNewLine_Passthru))
01909                 c = 'n';
01910             /*FALLTHRU*/
01911         case '\\':
01912         case '\'':
01913         case '"':
01914             break;
01915         case '&':
01916             if (lang != eLanguage_Javascript)
01917                 continue;
01918             break;
01919         default:
01920             if (isprint((unsigned char) c))
01921                 continue;
01922             break;
01923         }
01924         if (!out.get()) {
01925             out.reset(new CNcbiOstrstream);
01926         }
01927         if (i > j) {
01928             out->write(str.data() + j, i - j);
01929         }
01930         out->put('\\');
01931         if (c == '\n') {
01932             out->write("n\\\n", 3);
01933         } else if (!isprint((unsigned char) c)) {
01934             bool reduce;
01935             if (!(mode & NStr::fPrintable_Full)) {
01936                 reduce = (i == str.size() - 1  ||  s_IsQuoted(str[i + 1], lang)
01937                           ||  str[i + 1] < '0'  ||  str[i + 1] > '7');
01938             } else {
01939                 reduce = false;
01940             }
01941             unsigned char v;
01942             char octal[3];
01943             int k = 0;
01944             v =  (unsigned char) c >> 6;
01945             if (v  ||  !reduce) {
01946                 octal[k++] = '0' + v;
01947                 reduce = false;
01948             }
01949             v = ((unsigned char) c >> 3) & 7;
01950             if (v  ||  !reduce) {
01951                 octal[k++] = '0' + v;
01952             }
01953             v =  (unsigned char) c & 7;
01954             octal    [k++] = '0' + v;
01955             out->write(octal, k);
01956         } else {
01957             out->put(c);
01958         }
01959         j = i + 1;
01960     }
01961     if (j  &&  i > j) {
01962         _ASSERT(out.get());
01963         out->write(str.data() + j, i - j);
01964     }
01965     if (out.get()) {
01966         // Return encoded string
01967         return CNcbiOstrstreamToString(*out);
01968     }
01969 
01970     // All characters are good - return original string
01971     return str;
01972 }
01973 
01974         
01975 string NStr::PrintableString(const string&        str,
01976                              NStr::TPrintableMode mode)
01977 {
01978     return s_PrintableString(str, mode, eLanguage_C);
01979 }
01980 
01981 
01982 string NStr::JavaScriptEncode(const string& str)
01983 {
01984     return s_PrintableString(str, eNewLine_Quote, eLanguage_Javascript);
01985 }
01986 
01987 string NStr::XmlEncode(const string& str)
01988 // http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent
01989 {
01990     string result;
01991     SIZE_TYPE i;
01992     for (i = 0;  i < str.size();  i++) {
01993         char c = str[i];
01994         switch ( c ) {
01995         case '&':
01996             result.append("&amp;");
01997             break;
01998         case '<':
01999             result.append("&lt;");
02000             break;
02001         case '>':
02002             result.append("&gt;");
02003             break;
02004         case '\'':
02005             result.append("&apos;");
02006             break;
02007         case '"':
02008             result.append("&quot;");
02009             break;
02010         default:
02011             if ((unsigned int)(c) < 0x20) {
02012                 const char* charmap = "0123456789abcdef";
02013                 result.append("&#x");
02014                 Uint1 ch = c;
02015                 unsigned hi = ch >> 4;
02016                 unsigned lo = ch & 0xF;
02017                 if ( hi ) {
02018                     result.append(1, charmap[hi]);
02019                 }
02020                 result.append(1, charmap[lo]).append(1, ';');
02021             } else {
02022                 result.append(1, c);
02023             }
02024             break;
02025         }
02026     }
02027     return result;
02028 }
02029 
02030 string NStr::JsonEncode(const string& str)
02031 // http://www.json.org/
02032 {
02033     string result;
02034     SIZE_TYPE i;
02035     for (i = 0;  i < str.size();  i++) {
02036         char c = str[i];
02037         switch ( c ) {
02038         case '"':
02039             result.append("\\\"");
02040             break;
02041         case '\\':
02042             result.append("\\\\");
02043             break;
02044         default:
02045             if ((unsigned int)c < 0x20 || (unsigned int)c >= 0x80) {
02046                 const char* charmap = "0123456789abcdef";
02047                 result.append("\\u00");
02048                 Uint1 ch = c;
02049                 unsigned hi = ch >> 4;
02050                 unsigned lo = ch & 0xF;
02051                 result.append(1, charmap[hi]);
02052                 result.append(1, charmap[lo]);
02053             } else {
02054                 result.append(1, c);
02055             }
02056             break;
02057         }
02058     }
02059     return result;
02060 }
02061 
02062 
02063 string NStr::ParseEscapes(const string& str)
02064 {
02065     string out;
02066     out.reserve(str.size()); // can only be smaller
02067     SIZE_TYPE pos = 0;
02068 
02069     while (pos < str.size()) {
02070         SIZE_TYPE pos2 = str.find('\\', pos);
02071         if (pos2 == NPOS) {
02072             out += str.substr(pos);
02073             break;
02074         }
02075         out += str.substr(pos, pos2 - pos);
02076         if (++pos2 == str.size()) {
02077             NCBI_THROW2(CStringException, eFormat,
02078                         "Unterminated escape sequence", pos2);
02079         }
02080         switch (str[pos2]) {
02081         case 'a':  out += '\a';  break;
02082         case 'b':  out += '\b';  break;
02083         case 'f':  out += '\f';  break;
02084         case 'n':  out += '\n';  break;
02085         case 'r':  out += '\r';  break;
02086         case 't':  out += '\t';  break;
02087         case 'v':  out += '\v';  break;
02088         case 'x':
02089             {{
02090                 pos = ++pos2;
02091                 while (pos < str.size()
02092                        &&  isxdigit((unsigned char) str[pos])) {
02093                     pos++;
02094                 }
02095                 if (pos > pos2) {
02096                     out += static_cast<char>
02097                         (StringToUInt(str.substr(pos2, pos - pos2), 0, 16));
02098                 } else {
02099                     NCBI_THROW2(CStringException, eFormat,
02100                                 "\\x followed by no hexadecimal digits", pos);
02101                 }
02102             }}
02103             continue;
02104         case '0':  case '1':  case '2':  case '3':
02105         case '4':  case '5':  case '6':  case '7':
02106             {{
02107                 pos = pos2;
02108                 unsigned char c = str[pos++] - '0';
02109                 while (pos < pos2 + 3  &&  pos < str.size()
02110                        &&  str[pos] >= '0'  &&  str[pos] <= '7') {
02111                     c = (c << 3) | (str[pos++] - '0');
02112                 }
02113                 out += c;
02114             }}
02115             continue;
02116         case '\n':
02117             /*quoted EOL means no EOL*/
02118             break;
02119         default:
02120             out += str[pos2];
02121             break;
02122         }
02123         pos = pos2 + 1;
02124     }
02125     return out;
02126 }
02127 
02128 
02129 // Determines the end of an HTML <...> tag, accounting for attributes
02130 // and comments (the latter allowed only within <!...>).
02131 static SIZE_TYPE s_EndOfTag(const string& str, SIZE_TYPE start)
02132 {
02133     _ASSERT(start < str.size()  &&  str[start] == '<');
02134     bool comments_ok = (start + 1 < str.size()  &&  str[start + 1] == '!');
02135     for (SIZE_TYPE pos = start + 1;  pos < str.size();  ++pos) {
02136         switch (str[pos]) {
02137         case '>': // found the end
02138             return pos;
02139 
02140         case '\"': // start of "string"; advance to end
02141             pos = str.find('\"', pos + 1);
02142             if (pos == NPOS) {
02143                 NCBI_THROW2(CStringException, eFormat,
02144                             "Unclosed string in HTML tag", start);
02145                 // return pos;
02146             }
02147             break;
02148 
02149         case '-': // possible start of -- comment --; advance to end
02150             if (comments_ok  &&  pos + 1 < str.size()
02151                 &&  str[pos + 1] == '-') {
02152                 pos = str.find("--", pos + 2);
02153                 if (pos == NPOS) {
02154                     NCBI_THROW2(CStringException, eFormat,
02155                                 "Unclosed comment in HTML tag", start);
02156                     // return pos;
02157                 } else {
02158                     ++pos;
02159                 }
02160             }
02161         }
02162     }
02163     NCBI_THROW2(CStringException, eFormat, "Unclosed HTML tag", start);
02164     // return NPOS;
02165 }
02166 
02167 
02168 // Determines the end of an HTML &foo; character/entity reference
02169 // (which might not actually end with a semicolon :-/)
02170 static SIZE_TYPE s_EndOfReference(const string& str, SIZE_TYPE start)
02171 {
02172     _ASSERT(start < str.size()  &&  str[start] == '&');
02173 #ifdef NCBI_STRICT_HTML_REFS
02174     return str.find(';', start + 1);
02175 #else
02176     SIZE_TYPE pos = str.find_first_not_of
02177         ("#0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
02178          start + 1);
02179     if (pos == NPOS  ||  str[pos] == ';') {
02180         return pos;
02181     } else {
02182         return pos - 1;
02183     }
02184 #endif
02185 }
02186 
02187 
02188 static SIZE_TYPE s_VisibleWidth(const string& str, bool is_html)
02189 {
02190     if (is_html) {
02191         SIZE_TYPE width = 0, pos = 0;
02192         for (;;) {
02193             SIZE_TYPE pos2 = str.find_first_of("<&", pos);
02194             if (pos2 == NPOS) {
02195                 width += str.size() - pos;
02196                 break;
02197             } else {
02198                 width += pos2 - pos;
02199                 if (str[pos2] == '&') {
02200                     ++width;
02201                     pos = s_EndOfReference(str, pos);
02202                 } else {
02203                     pos = s_EndOfTag(str, pos);
02204                 }
02205                 if (pos == NPOS) {
02206                     break;
02207                 } else {
02208                     ++pos;
02209                 }
02210             }
02211         }
02212         return width;
02213     } else {
02214         return str.size();
02215     }
02216 }
02217 
02218 
// Break "str" into lines of at most "width" visible columns and append each
// produced line to "arr", which is also the return value.
//  - The first produced line starts with *prefix1 if given, otherwise with
//    *prefix; every later line starts with *prefix.  A null "prefix" is
//    treated as an empty string.
//  - fWrap_HTMLPre: <...> tags take no columns and &...; references take a
//    single column (s_EndOfTag() may throw CStringException on malformed
//    tags).
//  - fWrap_FlatFile: only whitespace, hyphens and commas are preferred
//    break points; after a break at a space the following spaces and one
//    newline are skipped.
//  - fWrap_Hyphenate: when a line has to be cut at an arbitrary position,
//    one character less is taken and '-' is appended.
//  - A '\b' in "str" removes the character before it from the output line
//    (never eating into the prefix) and is itself not output.
// NOTE(review): if the prefix alone is wider than "width" the scan loop
// below never runs and best_pos is still NPOS when it is used to build an
// iterator -- presumably callers never do this; confirm.
02219 list<string>& NStr::Wrap(const string& str, SIZE_TYPE width,
02220                          list<string>& arr, NStr::TWrapFlags flags,
02221                          const string* prefix, const string* prefix1)
02222 {
02223     if (prefix == 0) {
02224         prefix = &kEmptyStr;
02225     }
02226 
02227     const string* pfx = prefix1 ? prefix1 : prefix;
02228     SIZE_TYPE     pos = 0, len = str.size(), nl_pos = 0;
02229     
02230     bool          is_html  = flags & fWrap_HTMLPre ? true : false;
02231     bool          do_flat = (flags & fWrap_FlatFile) != 0;
02232 
          // Quality of a candidate break position; a later candidate with an
          // equal or better score replaces the current best one.
02233     enum EScore { // worst to best
02234         eForced,
02235         ePunct,
02236         eComma,
02237         eSpace,
02238         eNewline
02239     };
02240 
          // Each iteration produces exactly one output line.
02241     while (pos < len) {
02242         bool      hyphen     = false; // "-" or empty
02243         SIZE_TYPE column     = s_VisibleWidth(*pfx, is_html);
02244         SIZE_TYPE column0    = column;
02245         // the next line will start at best_pos
02246         SIZE_TYPE best_pos   = NPOS;
02247         EScore    best_score = eForced;
02248         SIZE_TYPE pos0       = pos;
              // nl_pos caches the position of the next '\n' (or len if none).
02249         if (nl_pos <= pos) {
02250             nl_pos = str.find('\n', pos);
02251             if (nl_pos == NPOS) {
02252                 nl_pos = len;
02253             }
02254         }
              // Shortcut: everything up to the next newline fits, so start the
              // scan right at that newline (or at the end of the string).
02255         if (column + (nl_pos-pos) <= width) {
02256             pos0 = nl_pos;
02257         }
02258         for (SIZE_TYPE pos2 = pos0;  pos2 < len  &&  column <= width;
02259              ++pos2, ++column) {
02260             EScore    score     = eForced;
02261             SIZE_TYPE score_pos = pos2;
02262             char      c         = str[pos2];
02263 
02264             if (c == '\n') {
02265                 best_pos   = pos2;
02266                 best_score = eNewline;
02267                 break;
02268             } else if (isspace((unsigned char) c)) {
02269                 if ( !do_flat  &&  pos2 > 0  &&
02270                      isspace((unsigned char) str[pos2 - 1])) {
02271                     continue; // take the first space of a group
02272                 }
02273                 score = eSpace;
02274             } else if (is_html  &&  c == '<') {
02275                 // treat tags as zero-width...
02276                 pos2 = s_EndOfTag(str, pos2);
02277                 --column;
02278             } else if (is_html  &&  c == '&') {
02279                 // ...and references as single characters
02280                 pos2 = s_EndOfReference(str, pos2);
02281             } else if (c == ','  &&  score_pos < len - 1  &&  column < width) {
02282                 score = eComma;
02283                 ++score_pos;
02284             } else if (do_flat ? c == '-' : ispunct((unsigned char) c)) {
02285                 // For flat files, only whitespace, hyphens and commas
02286                 // are special.
02287                 if (c == '('  ||  c == '['  ||  c == '{'  ||  c == '<'
02288                     ||  c == '`') { // opening element
02289                     score = ePunct;
02290                 } else if (score_pos < len - 1  &&  column < width) {
02291                     // Prefer breaking *after* most types of punctuation.
02292                     score = ePunct;
02293                     ++score_pos;
02294                 }
02295             }
02296 
                  // s_EndOfReference() returns NPOS when the reference runs to
                  // the end of the string.
02297             if (pos2 == NPOS) {
02298                 break;
02299             }
02300 
02301             if (score >= best_score  &&  score_pos > pos0) {
02302                 best_pos   = score_pos;
02303                 best_score = score;
02304             }
02305 
02306             while (pos2 < len - 1  &&  str[pos2 + 1] == '\b') {
02307                 // Account for backspaces
02308                 ++pos2;
02309                 if (column > column0) {
02310                     --column;
02311                 }
02312             }
02313         }
02314 
02315         if ( best_score != eNewline  &&  column <= width ) {
02316             // If the whole remaining text can fit, don't split it...
02317             best_pos = len;
02318         } else if ( best_score == eForced  &&  (flags & fWrap_Hyphenate) ) {
                  // Give up one character to make room for the hyphen.
02319             hyphen = true;
02320             --best_pos;
02321         }
02322         arr.push_back(*pfx);
02323         {{ // eat backspaces and the characters (if any) that precede them
02324             string::const_iterator begin = str.begin() + pos;
02325             string::const_iterator end = str.begin() + best_pos;
02326             string::const_iterator bs; // position of next backspace
02327             while ((bs = find(begin, end, '\b')) != end) {
02328                 if (bs != begin) {
02329                     // add all except the last one
02330                     arr.back().append(begin, bs - 1);
02331                 }
02332                 else {
02333                     // The backspace is at the beginning of next substring,
02334                     // so we should remove previously added symbol if any.
02335                     SIZE_TYPE size = arr.back().size();
02336                     if (size > pfx->size()) { // current size > prefix size
02337                         arr.back().resize(size - 1);
02338                     }
02339                 }
02340                 // skip over backspace
02341                 begin = bs + 1;
02342             }
02343             if (begin != end) {
02344                 // add remaining characters
02345                 arr.back().append(begin, end);
02346             }
02347         }}
02348         if ( hyphen ) {
02349             arr.back() += '-';
02350         }
02351         pos = best_pos;
              // Only the first line uses prefix1.
02352         pfx = prefix;
02353 
              // Step over the separator the line was broken at, so that it
              // does not start the next line.
02354         if (do_flat) {
02355             if (best_score == eSpace) {
02356                 while (str[pos] == ' ') {
02357                     ++pos;
02358                 }
02359                 if (str[pos] == '\n') {
02360                     ++pos;
02361                 }
02362             }
02363             if (best_score == eNewline) {
02364                 ++pos;
02365             }
02366         }
02367         else {
02368             if ( best_score == eSpace  ||  best_score == eNewline ) {
02369                 ++pos;
02370             }
02371         }
              // Backspaces right at the start of the next line have nothing
              // to erase; drop them.
02372         while (pos < len  &&  str[pos] == '\b') {
02373             ++pos;
02374         }
02375     }
02376 
02377     return arr;
02378 }
02379 
02380 
02381 list<string>& NStr::WrapList(const list<string>& l, SIZE_TYPE width,
02382                              const string& delim, list<string>& arr,
02383                              NStr::TWrapFlags flags, const string* prefix,
02384                              const string* prefix1)
02385 {
02386     if (l.empty()) {
02387         return arr;
02388     }
02389 
02390     const string* pfx      = prefix1 ? prefix1 : prefix;
02391     string        s        = *pfx;
02392     bool          is_html  = flags & fWrap_HTMLPre ? true : false;
02393     SIZE_TYPE     column   = s_VisibleWidth(s,     is_html);
02394     SIZE_TYPE     delwidth = s_VisibleWidth(delim, is_html);
02395     bool          at_start = true;
02396 
02397     ITERATE (list<string>, it, l) {
02398         SIZE_TYPE term_width = s_VisibleWidth(*it, is_html);
02399         if ( at_start ) {
02400             if (column + term_width <= width) {
02401                 s += *it;
02402                 column += term_width;
02403                 at_start = false;
02404             } else {
02405                 // Can't fit, even on its own line; break separately.
02406                 Wrap(*it, width, arr, flags, prefix, pfx);
02407                 pfx      = prefix;
02408                 s        = *prefix;
02409                 column   = s_VisibleWidth(s, is_html);
02410                 at_start = true;
02411             }
02412         } else if (column + delwidth + term_width <= width) {
02413             s += delim;
02414             s += *it;
02415             column += delwidth + term_width;
02416             at_start = false;
02417         } else {
02418             // Can't fit on this line; break here and try again.
02419             arr.push_back(s);
02420             pfx      = prefix;
02421             s        = *prefix;
02422             column   = s_VisibleWidth(s, is_html);
02423             at_start = true;
02424             --it;
02425         }
02426     }
02427 
02428     arr.push_back(s);
02429     return arr;
02430 }
02431 
02432 
#if !defined(HAVE_STRDUP)
// Replacement used when HAVE_STRDUP is not defined: return a malloc()'ed
// copy of the NUL-terminated string "str".  Yields a null pointer when
// "str" is null or when the allocation fails.
extern char* strdup(const char* str)
{
    if ( !str ) {
        return 0;
    }
    const size_t bytes = strlen(str) + 1;  // include the terminating NUL
    char*        copy  = (char*) malloc(bytes);
    if ( copy ) {
        memcpy(copy, str, bytes);
    }
    return copy;
}
#endif
02444 
02445 
// Percent-encoding table with one entry per byte value 0x00..0xFF.
// Left as is: ASCII letters, digits and  ! $ ' ( ) * , - . _
// Space is written as "+"; every other byte becomes "%XX" (uppercase hex).
// NOTE(review): presumably indexed by the unsigned value of each input
// character -- the lookup code is outside this part of the file; confirm.
02446 static const char s_Encode[256][4] = {
02447     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02448     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02449     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02450     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02451     "+",   "!",   "%22", "%23", "$",   "%25", "%26", "'",
02452     "(",   ")",   "*",   "%2B", ",",   "-",   ".",   "%2F",
02453     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02454     "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
02455     "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
02456     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02457     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02458     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02459     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02460     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02461     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02462     "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
02463     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02464     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02465     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02466     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02467     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02468     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02469     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02470     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02471     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02472     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02473     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02474     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02475     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02476     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02477     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02478     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02479 };
02480 
// Percent-encoding table with one entry per byte value 0x00..0xFF.
// Left as is: ASCII letters and digits only.
// Space is written as "+"; every other byte, punctuation marks included,
// becomes "%XX" (uppercase hex).
// NOTE(review): presumably indexed by the unsigned value of each input
// character -- the lookup code is outside this part of the file; confirm.
02481 static const char s_EncodeMarkChars[256][4] = {
02482     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02483     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02484     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02485     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02486     "+",   "%21", "%22", "%23", "%24", "%25", "%26", "%27",
02487     "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F",
02488     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02489     "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
02490     "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
02491     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02492     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02493     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "%5F",
02494     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02495     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02496     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02497     "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
02498     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02499     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02500     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02501     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02502     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02503     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02504     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02505     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02506     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02507     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02508     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02509     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02510     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02511     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02512     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02513     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02514 };
02515 
// Percent-encoding table with one entry per byte value 0x00..0xFF.
// Left as is: ASCII letters and digits only.
// Every other byte becomes "%XX" (uppercase hex); in particular the space
// is written as "%20", not as "+".
// NOTE(review): presumably indexed by the unsigned value of each input
// character -- the lookup code is outside this part of the file; confirm.
02516 static const char s_EncodePercentOnly[256][4] = {
02517     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02518     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02519     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02520     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02521     "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27",
02522     "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F",
02523     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02524     "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
02525     "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
02526     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02527     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02528     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "%5F",
02529     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02530     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02531     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02532     "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
02533     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02534     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02535     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02536     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02537     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02538     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02539     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02540     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02541     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02542     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02543     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02544     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02545     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02546     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02547     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02548     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02549 };
02550 
// Percent-encoding table with one entry per byte value 0x00..0xFF.
// Left as is: ASCII letters, digits and  . / _
// Space is written as "+"; every other byte becomes "%XX" (uppercase hex).
// NOTE(review): presumably indexed by the unsigned value of each input
// character -- the lookup code is outside this part of the file; confirm.
02551 static const char s_EncodePath[256][4] = {
02552     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02553     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02554     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02555     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02556     "+",   "%21", "%22", "%23", "%24", "%25", "%26", "%27",
02557     "%28", "%29", "%2A", "%2B", "%2C", "%2D", ".",   "/",
02558     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02559     "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
02560     "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
02561     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02562     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02563     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02564     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02565     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02566     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02567     "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
02568     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02569     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02570     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02571     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02572     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02573     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02574     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02575     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02576     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02577     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02578     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02579     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02580     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02581     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02582     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02583     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02584 };
02585 
// Percent-encoding table with one entry per byte value 0x00..0xFF.
// Left as is: ASCII letters, digits and  + - .
// Every other byte becomes "%XX" (uppercase hex); the space is "%20".
// NOTE(review): presumably indexed by the unsigned value of each input
// character -- the lookup code is outside this part of the file; confirm.
02586 static const char s_EncodeURIScheme[256][4] = {
02587     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02588     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02589     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02590     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", 
02591     "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27",
02592     "%28", "%29", "%2A", "+",   "%2C", "-",   ".",   "%2F",
02593     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02594     "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
02595     "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
02596     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02597     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02598     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "%5F",
02599     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02600     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02601     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02602     "x",   "y",   "z",   "%7B", "%7C", "%7D", "%7E", "%7F",
02603     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02604     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02605     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02606     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02607     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02608     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02609     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02610     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02611     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02612     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02613     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02614     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02615     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02616     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02617     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02618     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02619 };
02620 
// Per-byte substitution table selected by NStr::URLEncode() for
// eUrlEnc_URIUserinfo.  The table is indexed by the unsigned byte value; each
// entry is either the byte itself (one character, left unescaped) or its
// "%XX" escape with upper-case hex digits.
// Left unescaped here: ASCII letters, digits and ! $ & ' ( ) - . : _ ~
02621 static const char s_EncodeURIUserinfo[256][4] = {
02622     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02623     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02624     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02625     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02626     "%20", "!",   "%22", "%23", "$",   "%25", "&",   "'",
02627     "(",   ")",   "%2A", "%2B", "%2C", "-",   ".",   "%2F",
02628     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02629     "8",   "9",   ":",   "%3B", "%3C", "%3D", "%3E", "%3F",
02630     "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
02631     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02632     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02633     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02634     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02635     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02636     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02637     "x",   "y",   "z",   "%7B", "%7C", "%7D", "~",   "%7F",
02638     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02639     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02640     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02641     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02642     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02643     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02644     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02645     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02646     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02647     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02648     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02649     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02650     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02651     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02652     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02653     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02654 };
02655 
// Per-byte substitution table selected by NStr::URLEncode() for
// eUrlEnc_URIHost.  The table is indexed by the unsigned byte value; each
// entry is either the byte itself (one character, left unescaped) or its
// "%XX" escape with upper-case hex digits.
// Left unescaped here: ASCII letters, digits and ! $ & ' ( ) - . _ ~
// (':' is escaped as "%3A").
02656 static const char s_EncodeURIHost[256][4] = {
02657     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02658     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02659     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02660     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02661     "%20", "!",   "%22", "%23", "$",   "%25", "&",   "'",
02662     "(",   ")",   "%2A", "%2B", "%2C", "-",   ".",   "%2F",
02663     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02664     "8",   "9",   "%3A", "%3B", "%3C", "%3D", "%3E", "%3F",
02665     "%40", "A",   "B",   "C",   "D",   "E",   "F",   "G",
02666     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02667     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02668     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02669     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02670     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02671     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02672     "x",   "y",   "z",   "%7B", "%7C", "%7D", "~",   "%7F",
02673     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02674     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02675     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02676     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02677     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02678     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02679     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02680     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02681     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02682     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02683     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02684     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02685     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02686     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02687     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02688     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02689 };
02690 
// Per-byte substitution table selected by NStr::URLEncode() for
// eUrlEnc_URIPath.  The table is indexed by the unsigned byte value; each
// entry is either the byte itself (one character, left unescaped) or its
// "%XX" escape with upper-case hex digits.
// Left unescaped here: ASCII letters, digits and ! $ & ' ( ) - . / : @ _ ~
02691 static const char s_EncodeURIPath[256][4] = {
02692     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02693     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02694     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02695     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02696     "%20", "!",   "%22", "%23", "$",   "%25", "&",   "'",
02697     "(",   ")",   "%2A", "%2B", "%2C", "-",   ".",   "/",
02698     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02699     "8",   "9",   ":",   "%3B", "%3C", "%3D", "%3E", "%3F",
02700     "@",   "A",   "B",   "C",   "D",   "E",   "F",   "G",
02701     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02702     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02703     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02704     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02705     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02706     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02707     "x",   "y",   "z",   "%7B", "%7C", "%7D", "~",   "%7F",
02708     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02709     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02710     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02711     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02712     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02713     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02714     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02715     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02716     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02717     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02718     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02719     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02720     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02721     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02722     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02723     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02724 };
02725 
// Per-byte substitution table selected by NStr::URLEncode() for
// eUrlEnc_URIQueryName.  The table is indexed by the unsigned byte value;
// each entry is either the byte itself (one character, left unescaped) or
// its "%XX" escape with upper-case hex digits.
// Left unescaped here: ASCII letters, digits and
// ! $ & ' ( ) - . / : ? @ _ ~
// NOTE(review): '&' passes through unescaped while '=' is escaped ("%3D");
// confirm that leaving '&' as-is is intended for a query argument name.
02726 static const char s_EncodeURIQueryName[256][4] = {
02727     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02728     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02729     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02730     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02731     "%20", "!",   "%22", "%23", "$",   "%25", "&",   "'",
02732     "(",   ")",   "%2A", "%2B", "%2C", "-",   ".",   "/",
02733     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02734     "8",   "9",   ":",   "%3B", "%3C", "%3D", "%3E", "?",
02735     "@",   "A",   "B",   "C",   "D",   "E",   "F",   "G",
02736     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02737     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02738     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02739     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02740     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02741     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02742     "x",   "y",   "z",   "%7B", "%7C", "%7D", "~",   "%7F",
02743     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02744     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02745     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02746     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02747     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02748     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02749     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02750     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02751     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02752     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02753     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02754     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02755     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02756     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02757     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02758     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02759 };
02760 
// Per-byte substitution table selected by NStr::URLEncode() for
// eUrlEnc_URIQueryValue.  The table is indexed by the unsigned byte value;
// each entry is either the byte itself (one character, left unescaped) or
// its "%XX" escape with upper-case hex digits.
// Left unescaped here: ASCII letters, digits and
// ! $ & ' ( ) - . / : ? @ _ ~
// NOTE(review): '&' passes through unescaped while '=' is escaped ("%3D");
// confirm that leaving '&' as-is is intended for a query argument value.
02761 static const char s_EncodeURIQueryValue[256][4] = {
02762     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02763     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02764     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02765     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02766     "%20", "!",   "%22", "%23", "$",   "%25", "&",   "'",
02767     "(",   ")",   "%2A", "%2B", "%2C", "-",   ".",   "/",
02768     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02769     "8",   "9",   ":",   "%3B", "%3C", "%3D", "%3E", "?",
02770     "@",   "A",   "B",   "C",   "D",   "E",   "F",   "G",
02771     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02772     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02773     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02774     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02775     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02776     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02777     "x",   "y",   "z",   "%7B", "%7C", "%7D", "~",   "%7F",
02778     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02779     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02780     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02781     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02782     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02783     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02784     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02785     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02786     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02787     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02788     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02789     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02790     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02791     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02792     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02793     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02794 };
02795 
// Per-byte substitution table selected by NStr::URLEncode() for
// eUrlEnc_URIFragment.  The table is indexed by the unsigned byte value;
// each entry is either the byte itself (one character, left unescaped) or
// its "%XX" escape with upper-case hex digits.
// Left unescaped here: ASCII letters, digits and
// ! $ & ' ( ) - . / : ? @ _ ~
02796 static const char s_EncodeURIFragment[256][4] = {
02797     "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
02798     "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F",
02799     "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
02800     "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
02801     "%20", "!",   "%22", "%23", "$",   "%25", "&",   "'",
02802     "(",   ")",   "%2A", "%2B", "%2C", "-",   ".",   "/",
02803     "0",   "1",   "2",   "3",   "4",   "5",   "6",   "7",
02804     "8",   "9",   ":",   "%3B", "%3C", "%3D", "%3E", "?",
02805     "@",   "A",   "B",   "C",   "D",   "E",   "F",   "G",
02806     "H",   "I",   "J",   "K",   "L",   "M",   "N",   "O",
02807     "P",   "Q",   "R",   "S",   "T",   "U",   "V",   "W",
02808     "X",   "Y",   "Z",   "%5B", "%5C", "%5D", "%5E", "_",
02809     "%60", "a",   "b",   "c",   "d",   "e",   "f",   "g",
02810     "h",   "i",   "j",   "k",   "l",   "m",   "n",   "o",
02811     "p",   "q",   "r",   "s",   "t",   "u",   "v",   "w",
02812     "x",   "y",   "z",   "%7B", "%7C", "%7D", "~",   "%7F",
02813     "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
02814     "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F",
02815     "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
02816     "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F",
02817     "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7",
02818     "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF",
02819     "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
02820     "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF",
02821     "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7",
02822     "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF",
02823     "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7",
02824     "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF",
02825     "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7",
02826     "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF",
02827     "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7",
02828     "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF"
02829 };
02830 
02831 string NStr::URLEncode(const string& str, EUrlEncode flag)
02832 {
02833     SIZE_TYPE len = str.length();
02834     if ( !len ) {
02835         return kEmptyStr;
02836     }
02837 
02838     const char (*encode_table)[4];
02839     switch (flag) {
02840     case eUrlEnc_SkipMarkChars:
02841         encode_table = s_Encode;
02842         break;
02843     case eUrlEnc_ProcessMarkChars:
02844         encode_table = s_EncodeMarkChars;
02845         break;
02846     case eUrlEnc_PercentOnly:
02847         encode_table = s_EncodePercentOnly;
02848         break;
02849     case eUrlEnc_Path:
02850         encode_table = s_EncodePath;
02851         break;
02852     case eUrlEnc_URIScheme:
02853         encode_table = s_EncodeURIScheme;
02854         break;
02855     case eUrlEnc_URIUserinfo:
02856         encode_table = s_EncodeURIUserinfo;
02857         break;
02858     case eUrlEnc_URIHost:
02859         encode_table = s_EncodeURIHost;
02860         break;
02861     case eUrlEnc_URIPath:
02862         encode_table = s_EncodeURIPath;
02863         break;
02864     case eUrlEnc_URIQueryName:
02865         encode_table = s_EncodeURIQueryName;
02866         break;
02867     case eUrlEnc_URIQueryValue:
02868         encode_table = s_EncodeURIQueryValue;
02869         break;
02870     case eUrlEnc_URIFragment:
02871         encode_table = s_EncodeURIFragment;
02872         break;
02873     case eUrlEnc_None:
02874         return str;
02875     default:
02876         _TROUBLE;
02877         // To keep off compiler warning
02878         encode_table = 0;
02879     }
02880 
02881     string dst;
02882     SIZE_TYPE pos;
02883     SIZE_TYPE dst_len = len;
02884     const unsigned char* cstr = (const unsigned char*)str.c_str();
02885     for (pos = 0;  pos < len;  pos++) {
02886         if (encode_table[cstr[pos]][0] == '%')
02887             dst_len += 2;
02888     }
02889     dst.reserve(dst_len + 1);
02890     dst.resize(dst_len);
02891 
02892     SIZE_TYPE p = 0;
02893     for (pos = 0;  pos < len;  pos++, p++) {
02894         const char* subst = encode_table[cstr[pos]];
02895         if (*subst != '%') {
02896             dst[p] = *subst;
02897         } else {
02898             dst[p] = '%';
02899             dst[++p] = *(++subst);
02900             dst[++p] = *(++subst);
02901         }
02902     }
02903 
02904     _ASSERT( p == dst_len );
02905     dst[dst_len] = '\0';
02906     return dst;
02907 }
02908 
02909 
02910 CStringUTF8 NStr::SQLEncode(const CStringUTF8& str) {
02911     SIZE_TYPE     stringSize = str.size();
02912     CStringUTF8   result;
02913 
02914     result.reserve(stringSize + 6);
02915     result.append(1, '\'');
02916     for (SIZE_TYPE i = 0;  i < stringSize;  i++) {
02917         char  c = str[i];
02918         if (c ==  '\'')
02919             result.append(1, '\'');
02920         result.append(1, c);
02921     }
02922     result.append(1, '\'');
02923 
02924     return result;
02925 }
02926 
02927 
02928 void s_URLDecode(const string& src, string& dst, NStr::EUrlDecode flag)
02929 {
02930     SIZE_TYPE len = src.length();
02931     if ( !len ) {
02932         dst.clear();
02933         return;
02934     }
02935     if (dst.length() < src.length()) {
02936         dst.resize(len);
02937     }
02938 
02939     SIZE_TYPE pdst = 0;
02940     for (SIZE_TYPE psrc = 0;  psrc < len;  pdst++) {
02941         switch ( src[psrc] ) {
02942         case '%': {
02943             // Accordingly RFC 1738 the '%' character is unsafe
02944             // and should be always encoded, but sometimes it is
02945             // not really encoded...
02946             if (psrc + 2 > len) {
02947                 dst[pdst] = src[psrc++];
02948             } else {
02949                 int n1 = NStr::HexChar(src[psrc+1]);
02950                 int n2 = NStr::HexChar(src[psrc+2]);
02951                 if (n1 < 0  ||  n1 > 15  || n2 < 0  ||  n2 > 15) {
02952                     dst[pdst] = src[psrc++];
02953                 } else {
02954                     dst[pdst] = (n1 << 4) | n2;
02955                     psrc += 3;
02956                 }
02957             }
02958             break;
02959         }
02960         case '+': {
02961             dst[pdst] = (flag == NStr::eUrlDec_All) ? ' ' : '+';
02962             psrc++;
02963             break;
02964         }
02965         default:
02966             dst[pdst] = src[psrc++];
02967         }
02968     }
02969     if (pdst < len) {
02970         dst[pdst] = '\0';
02971         dst.resize(pdst);
02972     }
02973 }
02974 
02975 
02976 string NStr::URLDecode(const string& str, EUrlDecode flag)
02977 {
02978     string dst;
02979     s_URLDecode(str, dst, flag);
02980     return dst;
02981 }
02982 
02983 
02984 void NStr::URLDecodeInPlace(string& str, EUrlDecode flag)
02985 {
02986     s_URLDecode(str, str, flag);
02987 }
02988 
02989 
02990 bool NStr::NeedsURLEncoding(const string& str, EUrlEncode flag)
02991 {
02992     SIZE_TYPE len = str.length();
02993     if ( !len ) {
02994         return false;
02995     }
02996 
02997     const char (*encode_table)[4];
02998     switch (flag) {
02999     case eUrlEnc_SkipMarkChars:
03000         encode_table = s_Encode;
03001         break;
03002     case eUrlEnc_ProcessMarkChars:
03003         encode_table = s_EncodeMarkChars;
03004         break;
03005     case eUrlEnc_PercentOnly:
03006         encode_table = s_EncodePercentOnly;
03007         break;
03008     case eUrlEnc_Path:
03009         encode_table = s_EncodePath;
03010         break;
03011     case eUrlEnc_None:
03012         return false;
03013     default:
03014         _TROUBLE;
03015         // To keep off compiler warning
03016         encode_table = 0;
03017     }
03018 
03019     const unsigned char* cstr = (const unsigned char*)str.c_str();
03020     for (SIZE_TYPE pos = 0;  pos < len;  pos++) {
03021         const char* subst = encode_table[cstr[pos]];
03022         if (*subst != cstr[pos]) {
03023             return true;
03024         }
03025     }
03026 
03027     return false;
03028 }
03029 
03030 
03031 bool NStr::IsIPAddress(const string& ip)
03032 {
03033     const char* start = ip.c_str();
03034     const char* c = start;
03035     unsigned long val;
03036     int dots = 0;
03037 
03038     for (;;) {
03039         char* e;
03040         if ( !isdigit((unsigned char)(*c)) )
03041             return false;
03042         errno = 0;
03043         val = strtoul(c, &e, 10);
03044         if (c == e  ||  errno)
03045             return false;
03046         c = e;
03047         if (*c != '.')
03048             break;
03049         if (++dots > 3)
03050             return false;
03051         if (val > 255)
03052             return false;
03053         c++;
03054     }
03055 
03056     // Make sure the whole string was checked (it is possible to have \0 chars
03057     // in the middle of the string).
03058     if ((size_t)(c - start) != ip.size()) {
03059         return false;
03060     }
03061     return !*c  &&  dots == 3  &&  val < 256;
03062 }
03063 
03064 
03065 namespace {
03066     // Comparator to decide if a symbol is a delimiter
03067     template <typename TDelimiter>
03068     class PDelimiter
03069     {
03070     private:
03071         const TDelimiter& delimiter;
03072 
03073     public:
03074         PDelimiter(const TDelimiter& delim)
03075             : delimiter(delim)
03076         {}
03077 
03078         bool operator()(char tested_symbol) const;
03079     };
03080 
03081 
03082     // Template search for a field
03083     // @param str
03084     //   C or C++ string to search in.
03085     // @param field_no
03086     //   Zero-based field number.
03087     // @param delimiter
03088     //   Functor to decide if a symbol is a delimiter
03089     // @param merge
03090     //   Whether to merge or not adjacent delimiters.
03091     // @return
03092     //   Found field; or empty string if the required field is not found.
03093     template <typename TComparator, typename TResult>
03094     TResult s_GetField(const CTempString& str,
03095                        size_t             field_no,
03096                        const TComparator& delimiter,
03097                        NStr::EMergeDelims merge)
03098     {
03099         const char*   current_ptr   = str.data();
03100         const char*   end_ptr       = current_ptr + str.length();
03101         size_t        current_field = 0;
03102 
03103         // Search for the beginning of the required field
03104         for ( ;  current_field != field_no;  current_field++) {
03105             while (current_ptr < end_ptr  &&  !delimiter(*current_ptr))
03106                 current_ptr++;
03107 
03108             if (merge == NStr::eMergeDelims) {
03109                 while (current_ptr < end_ptr  &&  delimiter(*current_ptr))
03110                     current_ptr++;
03111             }
03112             else
03113                 current_ptr++;
03114 
03115             if (current_ptr >= end_ptr)
03116                 return TResult();
03117         }
03118 
03119         if (current_field != field_no)
03120             return TResult();
03121 
03122         // Here: current_ptr points to the first character after the delimiter.
03123         const char* field_start = current_ptr;
03124         while (current_ptr < end_ptr  &&  !delimiter(*current_ptr))
03125             current_ptr++;
03126 
03127         return TResult(field_start, current_ptr - field_start);
03128     }
03129 
03130 
03131 
03132     template <>
03133     bool PDelimiter<char>::operator() (char c) const
03134     {
03135         return delimiter == c;
03136     }
03137 
03138     template <>
03139     bool PDelimiter<CTempString>::operator() (char c) const
03140     {
03141         return delimiter.find(c) != NPOS;
03142     }
03143 }
03144 
03145 
03146 string NStr::GetField(const CTempString& str,
03147                       size_t             field_no,
03148                       const CTempString& delimiters,
03149                       EMergeDelims       merge)
03150 {
03151     return s_GetField<PDelimiter<CTempString>, string>
03152         (str,
03153          field_no,
03154          PDelimiter<CTempString>(delimiters),
03155          merge);
03156 }
03157 
03158 
03159 string NStr::GetField(const CTempString& str,
03160                       size_t             field_no,
03161                       char               delimiter,
03162                       EMergeDelims       merge)
03163 {
03164     return s_GetField<PDelimiter<char>, string>
03165         (str,
03166          field_no,
03167          PDelimiter<char>(delimiter),
03168          merge);
03169 }
03170 
03171 
03172 CTempString NStr::GetField_Unsafe(const CTempString& str,
03173                                   size_t             field_no,
03174                                   const CTempString& delimiters,
03175                                   EMergeDelims       merge)
03176 {
03177     return s_GetField<PDelimiter<CTempString>, CTempString>
03178         (str,
03179          field_no,
03180          PDelimiter<CTempString>(delimiters),
03181          merge);
03182 }
03183 
03184 
03185 CTempString NStr::GetField_Unsafe(const CTempString& str,
03186                                   size_t             field_no,
03187                                   char               delimiter,
03188                                   EMergeDelims       merge)
03189 {
03190     return s_GetField<PDelimiter<char>, CTempString>
03191         (str,
03192          field_no,
03193          PDelimiter<char>(delimiter),
03194          merge);
03195 }
03196 
03197 
03198 
03199 /////////////////////////////////////////////////////////////////////////////
03200 //  CStringUTF8
03201 
03202 SIZE_TYPE CStringUTF8::GetSymbolCount(void) const
03203 {
03204     SIZE_TYPE count = 0;
03205     for (const char* src = c_str(); *src; ++src, ++count) {
03206         SIZE_TYPE more = 0;
03207         bool good = x_EvalFirst(*src, more);
03208         while (more-- && good) {
03209             good = x_EvalNext(*(++src));
03210         }
03211         if ( !good ) {
03212             NCBI_THROW2(CStringException, eFormat,
03213                         "String is not in UTF8 format",
03214                         s_DiffPtr(src,c_str()));
03215         }
03216     }
03217     return count;
03218 }
03219 
03220 
03221 SIZE_TYPE CStringUTF8::GetValidSymbolCount(const char* src, SIZE_TYPE buf_size)
03222 {
03223     SIZE_TYPE count = 0, cur_size=0;
03224     for (; cur_size < buf_size && src && *src; ++src, ++count, ++cur_size) {
03225         SIZE_TYPE more = 0;
03226         bool good = x_EvalFirst(*src, more);
03227         while (more-- && good && ++cur_size < buf_size) {
03228             good = x_EvalNext(*(++src));
03229         }
03230         if ( !good ) {
03231             return count;
03232         }
03233     }
03234     return count;
03235 }
03236 
03237 
03238 SIZE_TYPE CStringUTF8::GetValidBytesCount(const char* src, SIZE_TYPE buf_size)
03239 {
03240     SIZE_TYPE count = 0;
03241     SIZE_TYPE cur_size = 0;
03242 
03243     for (; cur_size < buf_size && src && *src; ++src, ++count, ++cur_size) {
03244         SIZE_TYPE more = 0;
03245         bool good = x_EvalFirst(*src, more);
03246         while (more-- && good && cur_size < buf_size) {
03247             good = x_EvalNext(*(++src));
03248             if (good) {
03249                 ++cur_size;
03250             }
03251         }
03252         if ( !good ) {
03253             return cur_size;
03254         }
03255     }
03256     return cur_size;
03257 }
03258 
03259 
03260 string CStringUTF8::AsSingleByteString(EEncoding encoding,
03261     const char* substitute_on_error) const
03262 {
03263     string result;
03264     result.reserve( GetSymbolCount()+1 );
03265     for ( const char* src = c_str(); *src; ++src ) {
03266         TUnicodeSymbol sym = Decode( src );
03267         if (substitute_on_error) {
03268             try {
03269                 result.append(1, SymbolToChar( sym, encoding));
03270             }
03271             catch (CStringException&) {
03272                 result.append(substitute_on_error);
03273             }
03274         } else {
03275             result.append(1, SymbolToChar( sym, encoding));
03276         }
03277     }
03278     return result;
03279 }
03280 
03281 
03282 EEncoding CStringUTF8::GuessEncoding( const char* src)
03283 {
03284     SIZE_TYPE more = 0;
03285     bool cp1252, iso1, ascii, utf8;
03286     for (cp1252 = iso1 = ascii = utf8 = true; *src; ++src) {
03287         Uint1 ch = *src;
03288         bool skip = false;
03289         if (more != 0) {
03290             if (x_EvalNext(ch)) {
03291                 --more;
03292                 if (more == 0) {
03293                     ascii = cp1252 = iso1 = false;
03294                 }
03295                 skip = true;
03296             } else {
03297                 more = 0;
03298                 utf8 = false;
03299             }
03300         }
03301         if (ch > 0x7F) {
03302             ascii = false;
03303             if (ch < 0xA0) {
03304                 iso1 = false;
03305                 if (ch == 0x81 || ch == 0x8D || ch == 0x8F ||
03306                     ch == 0x90 || ch == 0x9D) {
03307                     cp1252 = false;
03308                 }
03309             }
03310             if (!skip && utf8 && !x_EvalFirst(ch, more)) {
03311                 utf8 = false;
03312             }
03313         }
03314     }
03315     if (more != 0) {
03316         utf8 = false;
03317     }
03318     if (ascii) {
03319         return eEncoding_Ascii;
03320     } else if (cp1252) {
03321         return iso1 ? eEncoding_ISO8859_1 : eEncoding_Windows_1252;
03322     } else if (utf8) {
03323         return eEncoding_UTF8;
03324     }
03325     return eEncoding_Unknown;
03326 }
03327 
03328 
03329 bool CStringUTF8::MatchEncoding( const char* src, EEncoding encoding)
03330 {
03331     bool matches = false;
03332     EEncoding enc_src = GuessEncoding(src);
03333     switch ( enc_src ) {
03334     default:
03335     case eEncoding_Unknown:
03336         matches = false;
03337         break;
03338     case eEncoding_Ascii:
03339         matches = true;
03340         break;
03341     case eEncoding_UTF8:
03342     case eEncoding_Windows_1252:
03343         matches = (encoding == enc_src);
03344         break;
03345     case eEncoding_ISO8859_1:
03346         matches = (encoding == enc_src || encoding == eEncoding_Windows_1252);
03347         break;
03348     }
03349     return matches;
03350 }
03351 
03352 
// Code points for the 32 Windows-1252 bytes 0x80..0x9F; entry [i] belongs
// to byte 0x80 + i.  The entries for bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D
// hold 0x003F ('?').  Used by CStringUTF8::CharToSymbol() for lookup and
// by CStringUTF8::SymbolToChar() for reverse search.
03353 // cp1252, codepoints for chars 0x80 to 0x9F
03354 static const TUnicodeSymbol s_cp1252_table[] = {
03355     0x20AC, 0x003F, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
03356     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003F, 0x017D, 0x003F,
03357     0x003F, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
03358     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x003F, 0x017E, 0x0178
03359 };
03360 
03361 
03362 TUnicodeSymbol CStringUTF8::CharToSymbol(char c, EEncoding encoding)
03363 {
03364     Uint1 ch = c;
03365     switch (encoding)
03366     {
03367     case eEncoding_Unknown:
03368     case eEncoding_UTF8:
03369         NCBI_THROW2(CStringException, eBadArgs,
03370                     "Unacceptable character encoding", 0);
03371     case eEncoding_Ascii:
03372     case eEncoding_ISO8859_1:
03373         break;
03374     case eEncoding_Windows_1252:
03375         if (ch > 0x7F && ch < 0xA0) {
03376             return s_cp1252_table[ ch - 0x80 ];
03377         }
03378         break;
03379     default:
03380         NCBI_THROW2(CStringException, eBadArgs,
03381                     "Unsupported character encoding", 0);
03382     }
03383     return (TUnicodeSymbol)ch;
03384 }
03385 
03386 
03387 char CStringUTF8::SymbolToChar(TUnicodeSymbol cp, EEncoding encoding)
03388 {
03389     if( encoding == eEncoding_UTF8 || encoding == eEncoding_Unknown) {
03390         NCBI_THROW2(CStringException, eBadArgs,
03391                     "Unacceptable character encoding", 0);
03392     }
03393     if ( cp <= 0xFF) {
03394         return (char)cp;
03395     }
03396     if ( encoding == eEncoding_Windows_1252 ) {
03397         for (Uint1 ch = 0x80; ch <= 0x9F; ++ch) {
03398             if (s_cp1252_table[ ch - 0x80 ] == cp) {
03399                 return (char)ch;
03400             }
03401         }
03402     }
03403     if (cp > 0xFF) {
03404         NCBI_THROW2(CStringException, eConvert,
03405                     "Failed to convert symbol to requested encoding", 0);
03406     }
03407     return (char)cp;
03408 }
03409 
03410 
03411 void CStringUTF8::x_Validate(void) const
03412 {
03413     if (!IsValid()) {
03414         NCBI_THROW2(CStringException, eBadArgs,
03415             "Source string is not in UTF8 format", 0);
03416     }
03417 }
03418 
03419 
03420 void CStringUTF8::x_AppendChar(TUnicodeSymbol c)
03421 {
03422     Uint4 ch = c;
03423     if (ch < 0x80) {
03424         append(1, Uint1(ch));
03425     }
03426     else if (ch < 0x800) {
03427         append(1, Uint1( (ch >>  6)         | 0xC0));
03428         append(1, Uint1( (ch        & 0x3F) | 0x80));
03429     } else if (ch < 0x10000) {
03430         append(1, Uint1( (ch >> 12)         | 0xE0));
03431         append(1, Uint1(((ch >>  6) & 0x3F) | 0x80));
03432         append(1, Uint1(( ch        & 0x3F) | 0x80));
03433     } else {
03434         append(1, Uint1( (ch >> 18)         | 0xF0));
03435         append(1, Uint1(((ch >> 12) & 0x3F) | 0x80));
03436         append(1, Uint1(((ch >>  6) & 0x3F) | 0x80));
03437         append(1, Uint1( (ch        & 0x3F) | 0x80));
03438     }
03439 }
03440 
03441 
03442 void CStringUTF8::x_Append(const char* src,
03443                            EEncoding encoding, EValidate validate)
03444 {
03445     if (encoding == eEncoding_Unknown) {
03446         encoding = GuessEncoding(src);
03447         if (encoding == eEncoding_Unknown) {
03448             NCBI_THROW2(CStringException, eBadArgs,
03449                 "Unable to guess the source string encoding", 0);
03450         }
03451     } else if (validate == eValidate) {
03452         if ( !MatchEncoding( src,encoding ) ) {
03453             NCBI_THROW2(CStringException, eBadArgs,
03454                 "Source string does not match the declared encoding", 0);
03455         }
03456     }
03457     if (encoding == eEncoding_UTF8 || encoding == eEncoding_Ascii) {
03458         append(src);
03459         return;
03460     }
03461 
03462     const char* srcBuf;
03463     SIZE_TYPE needed = 0;
03464     for (srcBuf = src; *srcBuf; ++srcBuf) {
03465         needed += x_BytesNeeded( CharToSymbol( *srcBuf,encoding ) );
03466     }
03467     if ( !needed ) {
03468         return;
03469     }
03470     reserve(max(capacity(),length()+needed+1));
03471     for (srcBuf = src; *srcBuf; ++srcBuf) {
03472         x_AppendChar( CharToSymbol( *srcBuf, encoding ) );
03473     }
03474 }
03475 
03476 
03477 SIZE_TYPE CStringUTF8::x_BytesNeeded(TUnicodeSymbol c)
03478 {
03479     Uint4 ch = c;
03480     if (ch < 0x80) {
03481         return 1;
03482     } else if (ch < 0x800) {
03483         return 2;
03484     } else if (ch < 0x10000) {
03485         return 3;
03486     }
03487     return 4;
03488 }
03489 
03490 
03491 bool CStringUTF8::x_EvalFirst(char ch, SIZE_TYPE& more)
03492 {
03493     more = 0;
03494     if ((ch & 0x80) != 0) {
03495         if ((ch & 0xE0) == 0xC0) {
03496             if ((ch & 0xFE) == 0xC0) {
03497                 // C0 and C1 are not valid UTF-8 chars
03498                 return false;
03499             }
03500             more = 1;
03501         } else if ((ch & 0xF0) == 0xE0) {
03502             more = 2;
03503         } else if ((ch & 0xF8) == 0xF0) {
03504             if ((unsigned char)ch > (unsigned char)0xF4) {
03505                 // F5-FF are not valid UTF-8 chars
03506                 return false;
03507             }
03508             more = 3;
03509         } else {
03510             return false;
03511         }
03512     }
03513     return true;
03514 }
03515 
03516 
03517 bool CStringUTF8::x_EvalNext(char ch)
03518 {
03519     return (ch & 0xC0) == 0x80;
03520 }
03521 
03522 
03523 TUnicodeSymbol CStringUTF8::Decode(const char*& src)
03524 {
03525     TUnicodeSymbol chRes;
03526     SIZE_TYPE more;
03527     Uint1 ch = *src;
03528     if ((ch & 0x80) == 0) {
03529         chRes = ch;
03530         more = 0;
03531     } else if ((ch & 0xE0) == 0xC0) {
03532         chRes = (ch & 0x1F);
03533         more = 1;
03534     } else if ((ch & 0xF0) == 0xE0) {
03535         chRes = (ch & 0x0F);
03536         more = 2;
03537     } else if ((ch & 0xF8) == 0xF0) {
03538         chRes = (ch & 0x07);
03539         more = 3;
03540     } else {
03541         NCBI_THROW2(CStringException, eBadArgs,
03542             "Source string is not in UTF8 format", 0);
03543     }
03544     while (more--) {
03545         ch = *(++src);
03546         if ((ch & 0xC0) != 0x80) {
03547             NCBI_THROW2(CStringException, eBadArgs,
03548                 "Source string is not in UTF8 format", 0);
03549         }
03550         chRes = (chRes << 6) | (ch & 0x3F);
03551     }
03552     return chRes;
03553 }
03554 
03555 
03556 TUnicodeSymbol CStringUTF8::DecodeFirst(char ch, SIZE_TYPE& more)
03557 {
03558     TUnicodeSymbol chRes = 0;
03559     more = 0;
03560     if ((ch & 0x80) == 0) {
03561         chRes = ch;
03562     } else if ((ch & 0xE0) == 0xC0) {
03563         chRes = (ch & 0x1F);
03564         more = 1;
03565     } else if ((ch & 0xF0) == 0xE0) {
03566         chRes = (ch & 0x0F);
03567         more = 2;
03568     } else if ((ch & 0xF8) == 0xF0) {
03569         chRes = (ch & 0x07);
03570         more = 3;
03571     }
03572     return chRes;
03573 }
03574 
03575 
03576 TUnicodeSymbol CStringUTF8::DecodeNext(TUnicodeSymbol chU, char ch)
03577 {
03578     if ((ch & 0xC0) == 0x80) {
03579         return (chU << 6) | (ch & 0x3F);
03580     }
03581     return 0;
03582 }
03583 
03584 
03585 const char* CStringException::GetErrCodeString(void) const
03586 {
03587     switch (GetErrCode()) {
03588     case eConvert:  return "eConvert";
03589     case eBadArgs:  return "eBadArgs";
03590     case eFormat:   return "eFormat";
03591     default:    return CException::GetErrCodeString();
03592     }
03593 }
03594 
03595 
03596 /////////////////////////////////////////////////////////////////////////////
03597 //  CStringPairsParser decoders and encoders
03598 
03599 
03600 CStringDecoder_Url::CStringDecoder_Url(NStr::EUrlDecode flag)
03601     : m_Flag(flag)
03602 {
03603 }
03604 
03605 
03606 string CStringDecoder_Url::Decode(const string& src,
03607                                   EStringType ) const
03608 {
03609     return NStr::URLDecode(src, m_Flag);
03610 }
03611 
03612 
03613 CStringEncoder_Url::CStringEncoder_Url(NStr::EUrlEncode flag)
03614     : m_Flag(flag)
03615 {
03616 }
03617 
03618 
03619 string CStringEncoder_Url::Encode(const string& src,
03620                                   EStringType ) const
03621 {
03622     return NStr::URLEncode(src, m_Flag);
03623 }
03624 
03625 
03626 /////////////////////////////////////////////////////////////////////////////
03627 // CEncodedString --
03628 
03629 CEncodedString::CEncodedString(const string& s,
03630                                NStr::EUrlEncode flag)
03631 {
03632     SetString(s, flag);
03633 }
03634 
03635 
03636 void CEncodedString::SetString(const string& s,
03637                                NStr::EUrlEncode flag)
03638 {
03639     m_Original = s;
03640     if ( NStr::NeedsURLEncoding(s, flag) ) {
03641         if ( m_Encoded.get() ) {
03642             // Do not re-allocate string object
03643             *m_Encoded = NStr::URLEncode(s, flag);
03644         }
03645         else {
03646             m_Encoded.reset(new string(NStr::URLEncode(s, flag)));
03647         }
03648     }
03649     else {
03650         m_Encoded.reset();
03651     }
03652 }
03653 
03654 
03655 /////////////////////////////////////////////////////////////////////////////
03656 //  CTempString (deprecated constructors, defined out of line to cut down
03657 //  on spurious warnings when building with compilers that warn on
03658 //  definition rather than merely, and arguably more sensibly, on usage).
03659 
03660 
03661 CTempString::CTempString(const char* str, size_type pos, size_type len)
03662     : m_String(str+pos), m_Length(len)
03663 {
03664 } // NCBI_FAKE_WARNING
03665 
03666 
03667 CTempString::CTempString(const string& str, size_type len)
03668     : m_String(str.data()), m_Length(min(len, str.size()))
03669 {
03670 } // NCBI_FAKE_WARNING
03671 
03672 
03673 END_NCBI_SCOPE
03674 
03675 

Generated on Sun Dec 6 22:22:38 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:57 2009 by modify_doxy.py rev. 173732