NCBI C++ Toolkit Cross Reference

C++/src/util/dictionary_util.cpp


  1 /*  $Id: dictionary_util.cpp 111184 2007-09-24 21:50:09Z vasilche $
  2  * ===========================================================================
  3  *
  4  *                            PUBLIC DOMAIN NOTICE
  5  *               National Center for Biotechnology Information
  6  *
  7  *  This software/database is a "United States Government Work" under the
  8  *  terms of the United States Copyright Act.  It was written as part of
  9  *  the author's official duties as a United States Government employee and
 10  *  thus cannot be copyrighted.  This software/database is freely available
 11  *  to the public for use. The National Library of Medicine and the U.S.
 12  *  Government have not placed any restriction on its use or reproduction.
 13  *
 14  *  Although all reasonable efforts have been taken to ensure the accuracy
 15  *  and reliability of the software and data, the NLM and the U.S.
 16  *  Government do not and cannot warrant the performance or results that
 17  *  may be obtained by using this software or data. The NLM and the U.S.
 18  *  Government disclaim all warranties, express or implied, including
 19  *  warranties of performance, merchantability or fitness for any particular
 20  *  purpose.
 21  *
 22  *  Please cite the author in any work or product based on this material.
 23  *
 24  * ===========================================================================
 25  *
 26  * Authors:  Mike DiCuccio
 27  *
 28  * File Description:
 29  *
 30  */
 31 
 32 #include <ncbi_pch.hpp>
 33 #include <util/dictionary_util.hpp>
 34 
 35 BEGIN_NCBI_SCOPE
 36 
 37 
 38 // maximum internal size for metaphone computation
 39 // this is used to determine a heap vs stack allocation cutoff in the exact
 40 // edit distance method; as CSimpleDictionary relies heavily on edit distance
 41 // computations, the size of its internal metaphone keys is tuned by this
 42 // parameter.
 43 static const size_t kMaxMetaphoneStack = 10;
 44 
 45 
 46 void CDictionaryUtil::GetMetaphone(const string& in, string* out,
 47                                    size_t max_chars)
 48 {
 49     out->erase();
 50     if (in.empty()) {
 51         return;
 52     }
 53 
 54     ITERATE (string, iter, in) {
 55         size_t prev_len = iter - in.begin();
 56         size_t remaining = in.length() - prev_len - 1;
 57 
 58         if (prev_len  &&
 59             tolower((unsigned char)(*iter)) == tolower((unsigned char)(*(iter - 1)))  &&
 60             tolower((unsigned char)(*iter)) != 'c') {
 61             continue;
 62         }
 63         switch (tolower((unsigned char)(*iter))) {
 64         case 'a':
 65         case 'e':
 66         case 'i':
 67         case 'o':
 68         case 'u':
 69             if ( !prev_len ) {
 70                 *out += 'a';
 71                 ++max_chars;
 72             }
 73             break;
 74 
 75         case 'b':
 76             if (remaining != 0  ||  *(iter - 1) != 'm') {
 77                 *out += 'p';
 78             }
 79             break;
 80 
 81         case 'f':
 82         case 'j':
 83         case 'l':
 84         case 'n':
 85         case 'r':
 86             *out += tolower((unsigned char)(*iter));
 87             break;
 88 
 89         case 'c':
 90             if (remaining > 2  &&
 91                 *(iter + 1) == 'i'  &&
 92                 *(iter + 2) == 'a') {
 93                 *out += 'x';
 94                 iter += 2;
 95                 break;
 96             }
 97 
 98             if (remaining > 1  &&  *(iter + 1) == 'h') {
 99                 *out += 'x';
100                 ++iter;
101                 break;
102             }
103 
104             if (remaining  &&
105                 ( *(iter + 1) == 'e'  ||
106                   *(iter + 1) == 'i'  ||
107                   *(iter + 1) == 'y' ) ) {
108                 *out += 's';
109                 ++iter;
110                 break;
111             }
112 
113             if (remaining  &&  *(iter + 1) == 'k') {
114                 ++iter;
115             }
116             *out += 'k';
117             break;
118 
119         case 'd':
120             if (remaining >= 2  &&  prev_len) {
121                 if ( *(iter + 1) == 'g'  &&
122                      ( *(iter + 2) == 'e'  ||
123                        *(iter + 2) == 'i'  ||
124                        *(iter + 2) == 'y' ) ) {
125                     *out += 'j';
126                     iter += 2;
127                     break;
128                 }
129             }
130             *out += 't';
131             break;
132 
133         case 'g':
134             if (remaining == 1  &&  *(iter + 1) == 'h') {
135                 if (prev_len > 2  &&  ( *(iter - 3) == 'b'  ||
136                                         *(iter - 3) == 'd') ) {
137                     *out += 'k';
138                     ++iter;
139                     break;
140                 }
141 
142                 if (prev_len > 3  &&  *(iter - 3) == 'h') {
143                     *out += 'k';
144                     ++iter;
145                     break;
146                 }
147                 if (prev_len > 4  &&  *(iter - 4) == 'h') {
148                     *out += 'k';
149                     ++iter;
150                     break;
151                 }
152 
153                 *out += 'f';
154                 ++iter;
155                 break;
156             }
157 
158             if (remaining == 1  &&
159                 (*(iter + 1) == 'n'  ||  *(iter + 1) == 'm')) {
160                 ++iter;
161                 break;
162             }
163 
164             if (remaining  &&  !prev_len  &&  *(iter + 1) == 'n') {
165                 ++iter;
166                 *out += 'n';
167                 break;
168             }
169 
170             if (remaining == 3  &&
171                 *(iter + 1) == 'n'  &&
172                 *(iter + 1) == 'e'  &&
173                 *(iter + 1) == 'd') {
174                 iter += 3;
175                 break;
176             }
177 
178             if ( (remaining > 1  &&  *(iter + 1) == 'e')  ||
179                  (remaining  &&  ( *(iter + 1) == 'i'  ||
180                                    *(iter + 1) == 'y' ) ) ) {
181                 *out += 'j';
182                 ++iter;
183                 break;
184             }
185 
186             *out += 'k';
187             break;
188 
189         case 'h':
190             if (remaining  &&  prev_len  &&
191                 ( *(iter + 1) == 'a'  ||
192                   *(iter + 1) == 'e'  ||
193                   *(iter + 1) == 'i'  ||
194                   *(iter + 1) == 'o'  ||
195                   *(iter + 1) == 'u') &&
196                 *(iter - 1) != 'c'  &&
197                 *(iter - 1) != 'g'  &&
198                 *(iter - 1) != 'p'  &&
199                 *(iter - 1) != 's'  &&
200                 *(iter - 1) != 't') {
201                 *out += tolower((unsigned char)(*iter));
202                 ++iter;
203             }
204             break;
205 
206         case 'm':
207         case 'k':
208             if (!prev_len  &&  remaining  &&  *(iter + 1) == 'n') {
209                 ++iter;
210                 *out += 'n';
211                 break;
212             }
213             *out += tolower((unsigned char)(*iter));
214             break;
215 
216         case 'p':
217             if (prev_len == 0  &&  remaining  &&  *(iter + 1) == 'n') {
218                 ++iter;
219                 *out += 'n';
220                 break;
221             }
222             if (remaining  &&  *(iter + 1) == 'h') {
223                 *out += 'f';
224                 break;
225             }
226             *out += tolower((unsigned char)(*iter));
227             break;
228 
229         case 'q':
230             *out += 'k';
231             break;
232 
233         case 's':
234             if (remaining > 2  &&
235                 *(iter + 1) == 'i'  &&
236                 ( *(iter + 2) == 'o'  ||
237                   *(iter + 2) == 'a' ) ) {
238                 iter += 2;
239                 *out += 'x';
240                 break;
241             }
242             if (remaining  &&  *(iter + 1) == 'h') {
243                 *out += 'x';
244                 ++iter;
245                 break;
246             }
247             if (remaining > 2  &&
248                 *(iter + 1) == 'c'  &&
249                 ( *(iter + 2) == 'e'  ||
250                   *(iter + 2) == 'i'  ||
251                   *(iter + 2) == 'y' ) ) {
252                 iter += 2;
253             }
254             *out += 's';
255             break;
256 
257         case 't':
258             if (remaining > 2  &&
259                 *(iter + 1) == 'i'  &&
260                 ( *(iter + 2) == 'o'  ||
261                   *(iter + 2) == 'a' ) ) {
262                 iter += 2;
263                 *out += 'x';
264                 break;
265             }
266             if (remaining  &&  *(iter + 1) == 'h') {
267                 *out += 'o';
268                 ++iter;
269                 break;
270             }
271             *out += tolower((unsigned char)(*iter));
272             break;
273 
274         case 'v':
275             *out += 'f';
276             break;
277 
278         case 'w':
279             if ( !prev_len ) {
280                 if (*(iter + 1) == 'h'  ||
281                     *(iter + 1) == 'r') {
282                     *out += *(iter + 1);
283                     ++iter;
284                     break;
285                 }
286                 *out += tolower((unsigned char)(*iter));
287                 break;
288             }
289 
290             if ( *(iter - 1) == 'a'  ||
291                  *(iter - 1) == 'e'  ||
292                  *(iter - 1) == 'i'  ||
293                  *(iter - 1) == 'o'  ||
294                  *(iter - 1) == 'u') {
295                 *out += tolower((unsigned char)(*iter));
296             }
297             break;
298 
299         case 'x':
300             *out += "ks";
301             break;
302 
303         case 'y':
304             if ( *(iter + 1) == 'a'  ||
305                  *(iter + 1) == 'e'  ||
306                  *(iter + 1) == 'i'  ||
307                  *(iter + 1) == 'o'  ||
308                  *(iter + 1) == 'u') {
309                 break;
310             }
311             if ( *(iter + 1) != 'a'  &&
312                  *(iter + 1) != 'e'  &&
313                  *(iter + 1) != 'i'  &&
314                  *(iter + 1) != 'o'  &&
315                  *(iter + 1) != 'u'  &&
316 
317                  *(iter - 1) != 'a'  &&
318                  *(iter - 1) != 'e'  &&
319                  *(iter - 1) != 'i'  &&
320                  *(iter - 1) != 'o'  &&
321                  *(iter - 1) != 'u') {
322                 break;
323             }
324             *out += tolower((unsigned char)(*iter));
325             break;
326 
327         case 'z':
328             *out += 's';
329             break;
330         }
331 
332         if (out->length() == max_chars) {
333             break;
334         }
335 
336     }
337 
338     //_TRACE("GetMetaphone(): " << in << " -> " << *out);
339 }
340 
341 
342 void CDictionaryUtil::GetSoundex(const string& in, string* out,
343                                  size_t max_chars, char pad_char)
344 {
345     static const char sc_SoundexLut[256] = {
346         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
347         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
348         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
349         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
350         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
351         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
352         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
353         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
354         0x00, 0x00, '1',  '2',  '3',  0x00, '1',  '2', 
355         0x00, 0x00, '2',  '2',  '4',  '5',  '5',  0x00, 
356         '1',  '2',  '6',  '2',  '3',  0x00, '1',  0x00, 
357         '2',  0x00, '2',  0x00, 0x00, 0x00, 0x00, 0x00, 
358         0x00, 0x00, '1',  '2',  '3',  0x00, '1',  '2', 
359         0x00, 0x00, '2',  '2',  '4',  '5',  '5',  0x00, 
360         '1',  '2',  '6',  '2',  '3',  0x00, '1',  0x00, 
361         '2',  0x00, '2',  0x00, 0x00, 0x00, 0x00, 0x00, 
362         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
363         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
364         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
365         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
366         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
367         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
368         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
369         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
370         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
371         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
372         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
373         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
374         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
375         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
376         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
377         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
378     };
379 
380     // basic sanity
381     out->erase();
382     if (in.empty()) {
383         return;
384     }
385 
386     // preserve the first character, in upper case
387     string::const_iterator iter = in.begin();
388     *out += toupper((unsigned char)(*iter));
389 
390     // now, iterate substituting codes, using no more than four characters
391     // total
392     ITERATE (string, iter2, in) {
393         char c = sc_SoundexLut[(int)(unsigned char)*iter2];
394         if (c  &&  *(out->end() - 1) != c) {
395             *out += c;
396             if (out->length() == max_chars) {
397                 break;
398             }
399         }
400     }
401 
402     // pad with our pad character
403     if (out->length() < max_chars) {
404         *out += string(max_chars - out->length(), pad_char);
405     }
406 }
407 
408 
409 size_t CDictionaryUtil::GetEditDistance(const string& str1,
410                                         const string& str2,
411                                         EDistanceMethod method)
412 {
413     switch (method) {
414     case eEditDistance_Similar:
415         {{
416             /// it lgically makes no difference which string
417             /// we look at as the master; we choose the shortest to make
418             /// some of the logic work better (also, it yields more accurate
419             /// results)
420             const string* pstr1 = &str1;
421             const string* pstr2 = &str2;
422             if (pstr1->length() > pstr2->length()) {
423                 swap(pstr1, pstr2);
424             }
425             size_t dist = 0;
426             string::const_iterator iter1 = pstr1->begin();
427             string::const_iterator iter2 = pstr2->begin();
428             for ( ;  iter1 != pstr1->end()  &&  iter2 != pstr2->end();  ) {
429                 char c1_0 = tolower((unsigned char)(*iter1));
430                 char c2_0 = tolower((unsigned char)(*iter2));
431                 if (c1_0 == c2_0) {
432                     /// identity: easy out
433                     ++iter1;
434                     ++iter2;
435                     continue;
436                 }
437 
438                 /// we scan for a match, starting from the corner formed
439                 /// as we march forward a few letters.  We use a maximum
440                 /// of 3 letters as our limit
441                 int max_radius = (int)min(pstr1->end() - iter1,
442                                           string::difference_type(3));
443 
444                 string::const_iterator best_iter1 = iter1 + 1;
445                 string::const_iterator best_iter2 = iter2 + 1;
446                 size_t cost = 1;
447 
448                 for (int radius = 1;  radius <= max_radius;  ++radius) {
449 
450                     char corner1 = *(iter1 + radius);
451                     char corner2 = *(iter2 + radius);
452                     bool match = false;
453                     for (int i = radius;  i >= 0;  --i) {
454                         c1_0 = tolower((unsigned char)(*(iter1 + i)));
455                         c2_0 = tolower((unsigned char)(*(iter2 + i)));
456                         if (c1_0 == corner2) {
457                             match = true;
458                             cost = radius;
459                             best_iter1 = iter1 + i;
460                             best_iter2 = iter2 + radius;
461                             break;
462                         }
463                         if (c2_0 == corner1) {
464                             match = true;
465                             cost = radius;
466                             best_iter1 = iter1 + radius;
467                             best_iter2 = iter2 + i;
468                             break;
469                         }
470                     }
471                     if (match) {
472                         break;
473                     }
474                 }
475 
476                 dist += cost;
477                 iter1 = best_iter1;
478                 iter2 = best_iter2;
479             }
480             dist += (pstr1->end() - iter1) + (pstr2->end() - iter2);
481             return dist;
482          }}
483 
484     case eEditDistance_Exact:
485         {{
486             size_t buf0[kMaxMetaphoneStack + 1];
487             size_t buf1[kMaxMetaphoneStack + 1];
488             vector<size_t> row0;
489             vector<size_t> row1;
490 
491             const string* short_str = &str1;
492             const string* long_str = &str2;
493             if (str2.length() < str1.length()) {
494                 swap(short_str, long_str);
495             }
496 
497             size_t* row0_ptr = buf0;
498             size_t* row1_ptr = buf1;
499             if (short_str->size() > kMaxMetaphoneStack) {
500                 row0.resize(short_str->size() + 1);
501                 row1.resize(short_str->size() + 1);
502                 row0_ptr = &row0[0];
503                 row1_ptr = &row1[0];
504             }
505 
506             size_t i;
507             size_t j;
508 
509             //cout << "   ";
510             for (i = 0;  i < short_str->size() + 1;  ++i) {
511                 //cout << (*short_str)[i] << "  ";
512                 row0_ptr[i] = i;
513                 row1_ptr[i] = i;
514             }
515             //cout << endl;
516 
517             for (i = 0;  i < long_str->size();  ++i) {
518                 row1_ptr[0] = i + 1;
519                 //cout << (*long_str)[i] << " ";
520                 for (j = 0;  j < short_str->size();  ++j) {
521                     int c0 = tolower((unsigned char) (*short_str)[j]);
522                     int c1 = tolower((unsigned char) (*long_str)[i]);
523                     size_t cost = (c0 == c1 ? 0 : 1);
524                     row1_ptr[j + 1] =
525                         min(row0_ptr[j] + cost,
526                             min(row0_ptr[j + 1] + 1, row1_ptr[j] + 1));
527                     //cout << setw(2) << row1_ptr[j + 1] << " ";
528                 }
529 
530                 //cout << endl;
531 
532                 swap(row0_ptr, row1_ptr);
533             }
534 
535             return row0_ptr[short_str->size()];
536          }}
537     }
538 
539     // undefined
540     return (size_t)-1;
541 }
542 
543 
544 /// Compute a nearness score for two different words or phrases
545 int CDictionaryUtil::Score(const string& word1, const string& word2,
546                            size_t max_metaphone)
547 {
548     string meta1;
549     string meta2;
550     GetMetaphone(word1, &meta1, max_metaphone);
551     GetMetaphone(word2, &meta2, max_metaphone);
552     return Score(word1, meta1, word2, meta2);
553 }
554 
555 
556 /// Compute a nearness score based on metaphone as well as raw distance
557 int CDictionaryUtil::Score(const string& word1, const string& meta1,
558                            const string& word2, const string& meta2,
559                            EDistanceMethod method)
560 {
561     // score:
562     // start with edit distance
563     size_t score = CDictionaryUtil::GetEditDistance(word1, word2, method);
564 
565     // normalize to length of word
566     // this allows negative scores to be omittied
567     score = word1.length() - score;
568 
569     // down-weight for metaphone distance
570     score -= CDictionaryUtil::GetEditDistance(meta1, meta2, method);
571 
572     // one point for first letter of word being same
573     //score += (tolower((unsigned char) word1[0]) == tolower((unsigned char) word2[0]));
574 
575     // one point for identical lengths of words
576     //score += (word1.length() == word2.length());
577 
578     return (int)score;
579 }
580 
581 
582 /////////////////////////////////////////////////////////////////////////////
583 ///
584 /// Porter's Stemming Algorithm
585 ///
586 
587 enum ECharType {
588     eOther,
589     eConsonant,
590     eVowel
591 };
592 
593 static ECharType s_char_type[256];
594 
595 struct SFillTypes
596 {
597     SFillTypes()
598     {
599         // This cycle is processed in backward order to avoid buggy
600         // optimization by ICC 9.1 on 64-bit platforms.
601         // The optimizer calls buggy intel_fast_mem(cpy|set) even with
602         // -fno-builtin-memset -fno-builtin-memcpy.
603         for (int i = 256;  i--; ) {
604             s_char_type[i] = eOther;
605         }
606 
607         for (int i = 0;  i < 26;  ++i) {
608             s_char_type[i + 'a'] = eConsonant;
609             s_char_type[i + 'A'] = eConsonant;
610         }
611 
612         s_char_type[(int)'a'] = eVowel;
613         s_char_type[(int)'e'] = eVowel;
614         s_char_type[(int)'i'] = eVowel;
615         s_char_type[(int)'o'] = eVowel;
616         s_char_type[(int)'u'] = eVowel;
617     }
618 };
619 static SFillTypes s_AutoFiller;
620 
621 static inline ECharType s_GetCharType(int c)
622 {
623     _ASSERT(c < 256  &&  c >= 0);
624     return s_char_type[c];
625 }
626 
627 static inline int s_MeasureWord(string::const_iterator iter,
628                                 string::const_iterator end)
629 {
630     int m = 0;
631 
632     /**
633     {{
634          ECharType first_char_type = s_GetCharType(*iter);
635          while (iter != end  &&  s_GetCharType(*iter) == first_char_type) {
636              ++iter;
637          }
638      }}
639      **/
640 
641 
642     // skip leading entities
643     ECharType prev_type = s_GetCharType(*iter);
644     for ( ;  iter != end;  ++iter) {
645         ECharType type = s_GetCharType(*iter);
646         if (type != prev_type) {
647             prev_type = type;
648             break;
649         }
650     }
651 
652     //prev_type = s_GetCharType(*iter);
653     for ( ;  iter != end;  ++iter) {
654         ECharType type = s_GetCharType(*iter);
655         if (type != prev_type) {
656             prev_type = type;
657             ++m;
658         }
659     }
660     /**
661     for ( ;  iter != end;  ++m) {
662         string::const_iterator prev(iter);
663         ECharType prev_type = s_GetCharType(*prev);
664         for (++iter;  iter != end;  ) {
665             ECharType type = s_GetCharType(*iter);
666             if (type != prev_type) {
667                 break;
668             }
669             prev_type = type;
670             prev = iter++;
671         }
672         if (iter != end) {
673             prev = iter;
674             prev_type = s_GetCharType(*prev);
675             for (++iter;  iter != end;  ) {
676                 ECharType type = s_GetCharType(*iter);
677                 if (type != prev_type) {
678                     break;
679                 }
680                 prev_type = type;
681                 prev = iter++;
682             }
683         }
684     }
685     **/
686 
687     return m;
688 }
689 
690 
691 static inline bool s_EndsWith(const string& str1, const string& str2)
692 {
693     string::const_reverse_iterator iter1(str1.end());
694     string::const_reverse_iterator end1 (str1.begin());
695     string::const_reverse_iterator iter2(str2.end());
696     string::const_reverse_iterator end2 (str2.begin());
697     for ( ;  iter1 != end1  &&  iter2 != end2;  ++iter1, ++iter2) {
698         if (*iter1 != *iter2) {
699             return false;
700         }
701     }
702     return true;
703 }
704 
705 static inline bool s_EndsWith(const string& str1, const char* p)
706 {
707     string::const_reverse_iterator iter1(str1.end());
708     string::const_reverse_iterator end1 (str1.begin());
709     const char* iter2 = p + strlen(p) - 1;
710     const char* end2  = p - 1;
711     for ( ;  iter1 != end1  &&  iter2 != end2;  ++iter1, --iter2) {
712         if (*iter1 != *iter2) {
713             return false;
714         }
715     }
716     return true;
717 }
718 
719 static string::size_type s_FindFirstVowel(const string& str)
720 {
721     for (string::size_type i = 0;  i < str.size();  ++i) {
722         if (s_GetCharType(str[i]) == eVowel) {
723             return i;
724         }
725     }
726     return string::npos;
727 }
728 
729 static inline bool s_ReplaceEnding(string& word,
730                                    const string& match,
731                                    const string& substitute,
732                                    int min_measure = 0)
733 {
734     if (word.length() < match.length()) {
735         return false;
736     }
737 
738     if ( !s_EndsWith(word, match) ) {
739         return false;
740     }
741 
742     if (s_MeasureWord(word.begin(),
743                       word.end() - match.length()) <= min_measure) {
744         return false;
745     }
746 
747     word.erase(word.length() - match.length());
748     word += substitute;
749     return true;
750 }
751 
752 
753 static inline bool s_ReplaceEnding(string& word,
754                                    const char* match,
755                                    const char* substitute,
756                                    int min_measure = 0)
757 {
758     size_t match_len = strlen(match);
759     if (word.length() < match_len) {
760         return false;
761     }
762 
763     if ( !s_EndsWith(word, match) ) {
764         return false;
765     }
766 
767     if (s_MeasureWord(word.begin(),
768                       word.end() - match_len) <= min_measure) {
769         return false;
770     }
771 
772     word.erase(word.length() - match_len);
773     word += substitute;
774     return true;
775 }
776 
777 
778 void CDictionaryUtil::Stem(const string& in_str, string* out_str)
779 {
780     *out_str = in_str;
781     string& str = *out_str;
782     //NStr::ToLower(str);
783 
784     // step 1a: common 's' endings
785     //
786     // sses -> ss
787     // ies  -> i
788     // ss   -> ss
789     // s    ->
790     if (str[ str.length()-1 ] == 's') {
791         do {
792             if (s_ReplaceEnding(str, "sses", "ss")) {
793                 break;
794             }
795 
796             if (s_ReplaceEnding(str, "ies", "i")) {
797                 break;
798             }
799 
800             if ( !s_EndsWith(str, "ss") ) {
801                 s_ReplaceEnding(str, "s", "");
802             }
803         }
804         while (false);
805     }
806 
807     // step 1b: ed/ing
808     //
809     // eed -> ed (not .eed)
810     // ed  ->    (*v*)
811     // ing ->    (*v*)
812     if (s_EndsWith(str, "eed")  &&  str.length() > 4) {
813         str.erase(str.length() - 1);
814     } else {
815         bool extra = false;
816         if (s_EndsWith(str, "ed")  &&  
817             s_FindFirstVowel(str) < str.length() - 3) {
818             str.erase(str.length() - 2);
819             extra = true;
820         } else if (s_EndsWith(str, "ing")  &&
821                    s_FindFirstVowel(str) < str.length() - 3) {
822             str.erase(str.length() - 3);
823             extra = true;
824         }
825 
826         if (extra) {
827             if (s_EndsWith(str, "at")  ||
828                 s_EndsWith(str, "bl")  ||
829                 s_EndsWith(str, "iz")) {
830                 str += 'e';
831             } else if (str[str.length() - 1] != 'l'  &&
832                        str[str.length() - 1] != 's'  &&
833                        str[str.length() - 1] != 'z'  &&
834                        str[str.length() - 1]  == str[str.length() - 2]) {
835                 str.erase(str.length() - 1);
836             } else if (str.length() == 3  &&
837                        s_GetCharType(str[0]) == eConsonant  &&
838                        s_GetCharType(str[1]) == eVowel  &&
839                        s_GetCharType(str[2]) == eConsonant) {
840                 str += 'e';
841             }
842         }
843     }
844 
845     // step 1c: y -> i
846     if (str[str.length() - 1] == 'y'  &&
847         s_FindFirstVowel(str) < str.length() - 1) {
848         str[str.length() - 1] = 'i';
849     }
850 
851     // step 2
852 
853     if (str.length() > 3) {
854         switch (str[ str.length() - 2 ]) {
855         case 'a':
856             if ( !s_ReplaceEnding(str, "ational", "ate") ) {
857                 s_ReplaceEnding(str, "tional", "tion");
858             }
859             break;
860 
861         case 'c':
862             if ( !s_ReplaceEnding(str, "enci", "ence") ) {
863                 s_ReplaceEnding(str, "anci", "ance");
864             }
865             break;
866 
867         case 'e':
868             s_ReplaceEnding(str, "izer", "ize");
869             break;
870 
871         case 'l':
872             if (str[ str.length()-1 ] == 'i') {
873                 if ( !s_ReplaceEnding(str, "abli", "able") ) {
874                     if ( !s_ReplaceEnding(str, "alli", "al") ) {
875                         if ( !s_ReplaceEnding(str, "entli", "ent") ) {
876                             if ( !s_ReplaceEnding(str, "eli", "e") ) {
877                                 s_ReplaceEnding(str, "ousli", "ous");
878                             }
879                         }
880                     }
881                 }
882             }
883             break;
884 
885         case 'o':
886             if ( !s_ReplaceEnding(str, "ization", "ize") ) {
887                 if ( !s_ReplaceEnding(str, "ation", "ate") ) {
888                     s_ReplaceEnding(str, "ator", "ate");
889                 }
890             }
891             break;
892 
893         case 's':
894             if ( !s_ReplaceEnding(str, "alism", "al") ) {
895                 if ( !s_ReplaceEnding(str, "iveness", "ive") ) {
896                     if ( !s_ReplaceEnding(str, "fulness", "ful") ) {
897                         s_ReplaceEnding(str, "ousness", "ous");
898                     }
899                 }
900             }
901             break;
902 
903         case 't':
904             if ( !s_ReplaceEnding(str, "aliti", "al") ) {
905                 if ( !s_ReplaceEnding(str, "iviti", "ive") ) {
906                     s_ReplaceEnding(str, "biliti", "ble");
907                 }
908             }
909             break;
910 
911         default:
912             break;
913         }
914     }
915 
916     // 'us' endings
917     //s_ReplaceEnding(str, "u", "us");
918 
919     // step 3
920     typedef pair<const char*, const char*> TReplace;
921     static const TReplace rep_step3[] = {
922         TReplace("icate", "ic"),
923         TReplace("ative", ""),
924         TReplace("alize", "al"),
925         TReplace("iciti", "ic"),
926         TReplace("ical", "ic"),
927         TReplace("ful", ""),
928         TReplace("ness", ""),
929 
930         /// end
931         TReplace(NULL, NULL)
932     };
933     {{
934          static string s_Step3_Endings("eils");
935          if (s_Step3_Endings.find_first_of(str[ str.length()-1 ]) != string::npos) {
936              for (const TReplace* p = rep_step3;  p->first;  ++p) {
937                  if (s_ReplaceEnding(str, p->first, p->second)) {
938                      break;
939                  }
940              }
941          }
942      }}
943 
944 
945     // step 4
946     if (str.length() > 2) {
947         switch (str[ str.length() - 2]) {
948         case 'a':
949             if (str[ str.length()-1 ] == 'l') {
950                 if (s_ReplaceEnding(str, "ual", "", 1)) {
951                     break;
952                 }
953                 if (s_ReplaceEnding(str, "ial", "", 1)) {
954                     break;
955                 }
956                 s_ReplaceEnding(str, "al", "", 1);
957             }
958             break;
959 
960         case 'c':
961             if (str[ str.length()-1 ] == 'e') {
962                 if ( !s_ReplaceEnding(str, "ance", "", 1) ) {
963                     s_ReplaceEnding(str, "ence", "", 1);
964                 }
965             }
966             break;
967 
968         case 'e':
969             s_ReplaceEnding(str, "er", "", 1);
970             break;
971 
972         case 'i':
973             if (s_ReplaceEnding(str, "ix", "ic", 0)) {
974                 break;
975             }
976             s_ReplaceEnding(str, "ic", "", 1);
977             break;
978 
979         case 'l':
980             if ( !s_ReplaceEnding(str, "able", "", 1) ) {
981                 s_ReplaceEnding(str, "ible", "", 1);
982             }
983             break;
984 
985         case 'n':
986             if ( !s_ReplaceEnding(str, "ant", "", 1) ) {
987                 if ( !s_ReplaceEnding(str, "ement", "", 1) ) {
988                     if ( !s_ReplaceEnding(str, "ment", "", 1) ) {
989                         s_ReplaceEnding(str, "ent", "", 1);
990                     }
991                 }
992             }
993             break;
994 
995         case 'o':
996             if ( !s_ReplaceEnding(str, "sion", "s", 1) ) {
997                 if ( !s_ReplaceEnding(str, "tion", "t", 1) ) {
998                     s_ReplaceEnding(str, "ou", "", 1);
999                 }
1000             }
1001             break;
1002 
1003         case 's':
1004             s_ReplaceEnding(str, "ism", "", 1);
1005             break;
1006 
1007         case 't':
1008             if ( !s_ReplaceEnding(str, "ate", "", 1) ) {
1009                 s_ReplaceEnding(str, "iti", "", 1);
1010             }
1011             break;
1012 
1013         case 'u':
1014             s_ReplaceEnding(str, "ous", "", 1);
1015             break;
1016 
1017         case 'v':
1018             s_ReplaceEnding(str, "ive", "", 1);
1019             break;
1020 
1021         case 'z':
1022             s_ReplaceEnding(str, "ize", "", 1);
1023             break;
1024         }
1025     }
1026 
1027     // step 5a
1028     s_ReplaceEnding(str, "e", "", 1);
1029 
1030     // step 5b
1031     if (s_MeasureWord(str.begin(), str.end()) > 1  &&
1032         str[str.length() - 1] == 'l'  &&
1033         str[str.length() - 2] == 'l') {
1034         str.erase(str.length() - 1);
1035     }
1036 
1037 }
1038 
1039 
1040 END_NCBI_SCOPE
1041 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.