|
NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/src/util/dictionary_util.cpp |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: dictionary_util.cpp 111184 2007-09-24 21:50:09Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <util/dictionary_util.hpp>
34
35 BEGIN_NCBI_SCOPE
36
37
38 // maximum internal size for metaphone computation
39 // this is used to determine a heap vs stack allocation cutoff in the exact
40 // edit distance method; as CSimpleDictionary relies heavily on edit distance
41 // computations, the size of its internal metaphone keys is tuned by this
42 // parameter.
43 static const size_t kMaxMetaphoneStack = 10;
44
45
46 void CDictionaryUtil::GetMetaphone(const string& in, string* out,
47 size_t max_chars)
48 {
49 out->erase();
50 if (in.empty()) {
51 return;
52 }
53
54 ITERATE (string, iter, in) {
55 size_t prev_len = iter - in.begin();
56 size_t remaining = in.length() - prev_len - 1;
57
58 if (prev_len &&
59 tolower((unsigned char)(*iter)) == tolower((unsigned char)(*(iter - 1))) &&
60 tolower((unsigned char)(*iter)) != 'c') {
61 continue;
62 }
63 switch (tolower((unsigned char)(*iter))) {
64 case 'a':
65 case 'e':
66 case 'i':
67 case 'o':
68 case 'u':
69 if ( !prev_len ) {
70 *out += 'a';
71 ++max_chars;
72 }
73 break;
74
75 case 'b':
76 if (remaining != 0 || *(iter - 1) != 'm') {
77 *out += 'p';
78 }
79 break;
80
81 case 'f':
82 case 'j':
83 case 'l':
84 case 'n':
85 case 'r':
86 *out += tolower((unsigned char)(*iter));
87 break;
88
89 case 'c':
90 if (remaining > 2 &&
91 *(iter + 1) == 'i' &&
92 *(iter + 2) == 'a') {
93 *out += 'x';
94 iter += 2;
95 break;
96 }
97
98 if (remaining > 1 && *(iter + 1) == 'h') {
99 *out += 'x';
100 ++iter;
101 break;
102 }
103
104 if (remaining &&
105 ( *(iter + 1) == 'e' ||
106 *(iter + 1) == 'i' ||
107 *(iter + 1) == 'y' ) ) {
108 *out += 's';
109 ++iter;
110 break;
111 }
112
113 if (remaining && *(iter + 1) == 'k') {
114 ++iter;
115 }
116 *out += 'k';
117 break;
118
119 case 'd':
120 if (remaining >= 2 && prev_len) {
121 if ( *(iter + 1) == 'g' &&
122 ( *(iter + 2) == 'e' ||
123 *(iter + 2) == 'i' ||
124 *(iter + 2) == 'y' ) ) {
125 *out += 'j';
126 iter += 2;
127 break;
128 }
129 }
130 *out += 't';
131 break;
132
133 case 'g':
134 if (remaining == 1 && *(iter + 1) == 'h') {
135 if (prev_len > 2 && ( *(iter - 3) == 'b' ||
136 *(iter - 3) == 'd') ) {
137 *out += 'k';
138 ++iter;
139 break;
140 }
141
142 if (prev_len > 3 && *(iter - 3) == 'h') {
143 *out += 'k';
144 ++iter;
145 break;
146 }
147 if (prev_len > 4 && *(iter - 4) == 'h') {
148 *out += 'k';
149 ++iter;
150 break;
151 }
152
153 *out += 'f';
154 ++iter;
155 break;
156 }
157
158 if (remaining == 1 &&
159 (*(iter + 1) == 'n' || *(iter + 1) == 'm')) {
160 ++iter;
161 break;
162 }
163
164 if (remaining && !prev_len && *(iter + 1) == 'n') {
165 ++iter;
166 *out += 'n';
167 break;
168 }
169
170 if (remaining == 3 &&
171 *(iter + 1) == 'n' &&
172 *(iter + 1) == 'e' &&
173 *(iter + 1) == 'd') {
174 iter += 3;
175 break;
176 }
177
178 if ( (remaining > 1 && *(iter + 1) == 'e') ||
179 (remaining && ( *(iter + 1) == 'i' ||
180 *(iter + 1) == 'y' ) ) ) {
181 *out += 'j';
182 ++iter;
183 break;
184 }
185
186 *out += 'k';
187 break;
188
189 case 'h':
190 if (remaining && prev_len &&
191 ( *(iter + 1) == 'a' ||
192 *(iter + 1) == 'e' ||
193 *(iter + 1) == 'i' ||
194 *(iter + 1) == 'o' ||
195 *(iter + 1) == 'u') &&
196 *(iter - 1) != 'c' &&
197 *(iter - 1) != 'g' &&
198 *(iter - 1) != 'p' &&
199 *(iter - 1) != 's' &&
200 *(iter - 1) != 't') {
201 *out += tolower((unsigned char)(*iter));
202 ++iter;
203 }
204 break;
205
206 case 'm':
207 case 'k':
208 if (!prev_len && remaining && *(iter + 1) == 'n') {
209 ++iter;
210 *out += 'n';
211 break;
212 }
213 *out += tolower((unsigned char)(*iter));
214 break;
215
216 case 'p':
217 if (prev_len == 0 && remaining && *(iter + 1) == 'n') {
218 ++iter;
219 *out += 'n';
220 break;
221 }
222 if (remaining && *(iter + 1) == 'h') {
223 *out += 'f';
224 break;
225 }
226 *out += tolower((unsigned char)(*iter));
227 break;
228
229 case 'q':
230 *out += 'k';
231 break;
232
233 case 's':
234 if (remaining > 2 &&
235 *(iter + 1) == 'i' &&
236 ( *(iter + 2) == 'o' ||
237 *(iter + 2) == 'a' ) ) {
238 iter += 2;
239 *out += 'x';
240 break;
241 }
242 if (remaining && *(iter + 1) == 'h') {
243 *out += 'x';
244 ++iter;
245 break;
246 }
247 if (remaining > 2 &&
248 *(iter + 1) == 'c' &&
249 ( *(iter + 2) == 'e' ||
250 *(iter + 2) == 'i' ||
251 *(iter + 2) == 'y' ) ) {
252 iter += 2;
253 }
254 *out += 's';
255 break;
256
257 case 't':
258 if (remaining > 2 &&
259 *(iter + 1) == 'i' &&
260 ( *(iter + 2) == 'o' ||
261 *(iter + 2) == 'a' ) ) {
262 iter += 2;
263 *out += 'x';
264 break;
265 }
266 if (remaining && *(iter + 1) == 'h') {
267 *out += 'o';
268 ++iter;
269 break;
270 }
271 *out += tolower((unsigned char)(*iter));
272 break;
273
274 case 'v':
275 *out += 'f';
276 break;
277
278 case 'w':
279 if ( !prev_len ) {
280 if (*(iter + 1) == 'h' ||
281 *(iter + 1) == 'r') {
282 *out += *(iter + 1);
283 ++iter;
284 break;
285 }
286 *out += tolower((unsigned char)(*iter));
287 break;
288 }
289
290 if ( *(iter - 1) == 'a' ||
291 *(iter - 1) == 'e' ||
292 *(iter - 1) == 'i' ||
293 *(iter - 1) == 'o' ||
294 *(iter - 1) == 'u') {
295 *out += tolower((unsigned char)(*iter));
296 }
297 break;
298
299 case 'x':
300 *out += "ks";
301 break;
302
303 case 'y':
304 if ( *(iter + 1) == 'a' ||
305 *(iter + 1) == 'e' ||
306 *(iter + 1) == 'i' ||
307 *(iter + 1) == 'o' ||
308 *(iter + 1) == 'u') {
309 break;
310 }
311 if ( *(iter + 1) != 'a' &&
312 *(iter + 1) != 'e' &&
313 *(iter + 1) != 'i' &&
314 *(iter + 1) != 'o' &&
315 *(iter + 1) != 'u' &&
316
317 *(iter - 1) != 'a' &&
318 *(iter - 1) != 'e' &&
319 *(iter - 1) != 'i' &&
320 *(iter - 1) != 'o' &&
321 *(iter - 1) != 'u') {
322 break;
323 }
324 *out += tolower((unsigned char)(*iter));
325 break;
326
327 case 'z':
328 *out += 's';
329 break;
330 }
331
332 if (out->length() == max_chars) {
333 break;
334 }
335
336 }
337
338 //_TRACE("GetMetaphone(): " << in << " -> " << *out);
339 }
340
341
342 void CDictionaryUtil::GetSoundex(const string& in, string* out,
343 size_t max_chars, char pad_char)
344 {
345 static const char sc_SoundexLut[256] = {
346 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
347 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
348 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
349 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
350 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
351 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
352 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
353 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
354 0x00, 0x00, '1', '2', '3', 0x00, '1', '2',
355 0x00, 0x00, '2', '2', '4', '5', '5', 0x00,
356 '1', '2', '6', '2', '3', 0x00, '1', 0x00,
357 '2', 0x00, '2', 0x00, 0x00, 0x00, 0x00, 0x00,
358 0x00, 0x00, '1', '2', '3', 0x00, '1', '2',
359 0x00, 0x00, '2', '2', '4', '5', '5', 0x00,
360 '1', '2', '6', '2', '3', 0x00, '1', 0x00,
361 '2', 0x00, '2', 0x00, 0x00, 0x00, 0x00, 0x00,
362 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
363 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
364 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
365 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
366 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
367 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
368 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
369 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
370 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
371 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
372 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
373 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
374 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
375 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
376 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
377 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
378 };
379
380 // basic sanity
381 out->erase();
382 if (in.empty()) {
383 return;
384 }
385
386 // preserve the first character, in upper case
387 string::const_iterator iter = in.begin();
388 *out += toupper((unsigned char)(*iter));
389
390 // now, iterate substituting codes, using no more than four characters
391 // total
392 ITERATE (string, iter2, in) {
393 char c = sc_SoundexLut[(int)(unsigned char)*iter2];
394 if (c && *(out->end() - 1) != c) {
395 *out += c;
396 if (out->length() == max_chars) {
397 break;
398 }
399 }
400 }
401
402 // pad with our pad character
403 if (out->length() < max_chars) {
404 *out += string(max_chars - out->length(), pad_char);
405 }
406 }
407
408
409 size_t CDictionaryUtil::GetEditDistance(const string& str1,
410 const string& str2,
411 EDistanceMethod method)
412 {
413 switch (method) {
414 case eEditDistance_Similar:
415 {{
416 /// it lgically makes no difference which string
417 /// we look at as the master; we choose the shortest to make
418 /// some of the logic work better (also, it yields more accurate
419 /// results)
420 const string* pstr1 = &str1;
421 const string* pstr2 = &str2;
422 if (pstr1->length() > pstr2->length()) {
423 swap(pstr1, pstr2);
424 }
425 size_t dist = 0;
426 string::const_iterator iter1 = pstr1->begin();
427 string::const_iterator iter2 = pstr2->begin();
428 for ( ; iter1 != pstr1->end() && iter2 != pstr2->end(); ) {
429 char c1_0 = tolower((unsigned char)(*iter1));
430 char c2_0 = tolower((unsigned char)(*iter2));
431 if (c1_0 == c2_0) {
432 /// identity: easy out
433 ++iter1;
434 ++iter2;
435 continue;
436 }
437
438 /// we scan for a match, starting from the corner formed
439 /// as we march forward a few letters. We use a maximum
440 /// of 3 letters as our limit
441 int max_radius = (int)min(pstr1->end() - iter1,
442 string::difference_type(3));
443
444 string::const_iterator best_iter1 = iter1 + 1;
445 string::const_iterator best_iter2 = iter2 + 1;
446 size_t cost = 1;
447
448 for (int radius = 1; radius <= max_radius; ++radius) {
449
450 char corner1 = *(iter1 + radius);
451 char corner2 = *(iter2 + radius);
452 bool match = false;
453 for (int i = radius; i >= 0; --i) {
454 c1_0 = tolower((unsigned char)(*(iter1 + i)));
455 c2_0 = tolower((unsigned char)(*(iter2 + i)));
456 if (c1_0 == corner2) {
457 match = true;
458 cost = radius;
459 best_iter1 = iter1 + i;
460 best_iter2 = iter2 + radius;
461 break;
462 }
463 if (c2_0 == corner1) {
464 match = true;
465 cost = radius;
466 best_iter1 = iter1 + radius;
467 best_iter2 = iter2 + i;
468 break;
469 }
470 }
471 if (match) {
472 break;
473 }
474 }
475
476 dist += cost;
477 iter1 = best_iter1;
478 iter2 = best_iter2;
479 }
480 dist += (pstr1->end() - iter1) + (pstr2->end() - iter2);
481 return dist;
482 }}
483
484 case eEditDistance_Exact:
485 {{
486 size_t buf0[kMaxMetaphoneStack + 1];
487 size_t buf1[kMaxMetaphoneStack + 1];
488 vector<size_t> row0;
489 vector<size_t> row1;
490
491 const string* short_str = &str1;
492 const string* long_str = &str2;
493 if (str2.length() < str1.length()) {
494 swap(short_str, long_str);
495 }
496
497 size_t* row0_ptr = buf0;
498 size_t* row1_ptr = buf1;
499 if (short_str->size() > kMaxMetaphoneStack) {
500 row0.resize(short_str->size() + 1);
501 row1.resize(short_str->size() + 1);
502 row0_ptr = &row0[0];
503 row1_ptr = &row1[0];
504 }
505
506 size_t i;
507 size_t j;
508
509 //cout << " ";
510 for (i = 0; i < short_str->size() + 1; ++i) {
511 //cout << (*short_str)[i] << " ";
512 row0_ptr[i] = i;
513 row1_ptr[i] = i;
514 }
515 //cout << endl;
516
517 for (i = 0; i < long_str->size(); ++i) {
518 row1_ptr[0] = i + 1;
519 //cout << (*long_str)[i] << " ";
520 for (j = 0; j < short_str->size(); ++j) {
521 int c0 = tolower((unsigned char) (*short_str)[j]);
522 int c1 = tolower((unsigned char) (*long_str)[i]);
523 size_t cost = (c0 == c1 ? 0 : 1);
524 row1_ptr[j + 1] =
525 min(row0_ptr[j] + cost,
526 min(row0_ptr[j + 1] + 1, row1_ptr[j] + 1));
527 //cout << setw(2) << row1_ptr[j + 1] << " ";
528 }
529
530 //cout << endl;
531
532 swap(row0_ptr, row1_ptr);
533 }
534
535 return row0_ptr[short_str->size()];
536 }}
537 }
538
539 // undefined
540 return (size_t)-1;
541 }
542
543
544 /// Compute a nearness score for two different words or phrases
545 int CDictionaryUtil::Score(const string& word1, const string& word2,
546 size_t max_metaphone)
547 {
548 string meta1;
549 string meta2;
550 GetMetaphone(word1, &meta1, max_metaphone);
551 GetMetaphone(word2, &meta2, max_metaphone);
552 return Score(word1, meta1, word2, meta2);
553 }
554
555
556 /// Compute a nearness score based on metaphone as well as raw distance
557 int CDictionaryUtil::Score(const string& word1, const string& meta1,
558 const string& word2, const string& meta2,
559 EDistanceMethod method)
560 {
561 // score:
562 // start with edit distance
563 size_t score = CDictionaryUtil::GetEditDistance(word1, word2, method);
564
565 // normalize to length of word
566 // this allows negative scores to be omittied
567 score = word1.length() - score;
568
569 // down-weight for metaphone distance
570 score -= CDictionaryUtil::GetEditDistance(meta1, meta2, method);
571
572 // one point for first letter of word being same
573 //score += (tolower((unsigned char) word1[0]) == tolower((unsigned char) word2[0]));
574
575 // one point for identical lengths of words
576 //score += (word1.length() == word2.length());
577
578 return (int)score;
579 }
580
581
582 /////////////////////////////////////////////////////////////////////////////
583 ///
584 /// Porter's Stemming Algorithm
585 ///
586
587 enum ECharType {
588 eOther,
589 eConsonant,
590 eVowel
591 };
592
593 static ECharType s_char_type[256];
594
595 struct SFillTypes
596 {
597 SFillTypes()
598 {
599 // This cycle is processed in backward order to avoid buggy
600 // optimization by ICC 9.1 on 64-bit platforms.
601 // The optimizer calls buggy intel_fast_mem(cpy|set) even with
602 // -fno-builtin-memset -fno-builtin-memcpy.
603 for (int i = 256; i--; ) {
604 s_char_type[i] = eOther;
605 }
606
607 for (int i = 0; i < 26; ++i) {
608 s_char_type[i + 'a'] = eConsonant;
609 s_char_type[i + 'A'] = eConsonant;
610 }
611
612 s_char_type[(int)'a'] = eVowel;
613 s_char_type[(int)'e'] = eVowel;
614 s_char_type[(int)'i'] = eVowel;
615 s_char_type[(int)'o'] = eVowel;
616 s_char_type[(int)'u'] = eVowel;
617 }
618 };
619 static SFillTypes s_AutoFiller;
620
621 static inline ECharType s_GetCharType(int c)
622 {
623 _ASSERT(c < 256 && c >= 0);
624 return s_char_type[c];
625 }
626
627 static inline int s_MeasureWord(string::const_iterator iter,
628 string::const_iterator end)
629 {
630 int m = 0;
631
632 /**
633 {{
634 ECharType first_char_type = s_GetCharType(*iter);
635 while (iter != end && s_GetCharType(*iter) == first_char_type) {
636 ++iter;
637 }
638 }}
639 **/
640
641
642 // skip leading entities
643 ECharType prev_type = s_GetCharType(*iter);
644 for ( ; iter != end; ++iter) {
645 ECharType type = s_GetCharType(*iter);
646 if (type != prev_type) {
647 prev_type = type;
648 break;
649 }
650 }
651
652 //prev_type = s_GetCharType(*iter);
653 for ( ; iter != end; ++iter) {
654 ECharType type = s_GetCharType(*iter);
655 if (type != prev_type) {
656 prev_type = type;
657 ++m;
658 }
659 }
660 /**
661 for ( ; iter != end; ++m) {
662 string::const_iterator prev(iter);
663 ECharType prev_type = s_GetCharType(*prev);
664 for (++iter; iter != end; ) {
665 ECharType type = s_GetCharType(*iter);
666 if (type != prev_type) {
667 break;
668 }
669 prev_type = type;
670 prev = iter++;
671 }
672 if (iter != end) {
673 prev = iter;
674 prev_type = s_GetCharType(*prev);
675 for (++iter; iter != end; ) {
676 ECharType type = s_GetCharType(*iter);
677 if (type != prev_type) {
678 break;
679 }
680 prev_type = type;
681 prev = iter++;
682 }
683 }
684 }
685 **/
686
687 return m;
688 }
689
690
691 static inline bool s_EndsWith(const string& str1, const string& str2)
692 {
693 string::const_reverse_iterator iter1(str1.end());
694 string::const_reverse_iterator end1 (str1.begin());
695 string::const_reverse_iterator iter2(str2.end());
696 string::const_reverse_iterator end2 (str2.begin());
697 for ( ; iter1 != end1 && iter2 != end2; ++iter1, ++iter2) {
698 if (*iter1 != *iter2) {
699 return false;
700 }
701 }
702 return true;
703 }
704
705 static inline bool s_EndsWith(const string& str1, const char* p)
706 {
707 string::const_reverse_iterator iter1(str1.end());
708 string::const_reverse_iterator end1 (str1.begin());
709 const char* iter2 = p + strlen(p) - 1;
710 const char* end2 = p - 1;
711 for ( ; iter1 != end1 && iter2 != end2; ++iter1, --iter2) {
712 if (*iter1 != *iter2) {
713 return false;
714 }
715 }
716 return true;
717 }
718
719 static string::size_type s_FindFirstVowel(const string& str)
720 {
721 for (string::size_type i = 0; i < str.size(); ++i) {
722 if (s_GetCharType(str[i]) == eVowel) {
723 return i;
724 }
725 }
726 return string::npos;
727 }
728
729 static inline bool s_ReplaceEnding(string& word,
730 const string& match,
731 const string& substitute,
732 int min_measure = 0)
733 {
734 if (word.length() < match.length()) {
735 return false;
736 }
737
738 if ( !s_EndsWith(word, match) ) {
739 return false;
740 }
741
742 if (s_MeasureWord(word.begin(),
743 word.end() - match.length()) <= min_measure) {
744 return false;
745 }
746
747 word.erase(word.length() - match.length());
748 word += substitute;
749 return true;
750 }
751
752
753 static inline bool s_ReplaceEnding(string& word,
754 const char* match,
755 const char* substitute,
756 int min_measure = 0)
757 {
758 size_t match_len = strlen(match);
759 if (word.length() < match_len) {
760 return false;
761 }
762
763 if ( !s_EndsWith(word, match) ) {
764 return false;
765 }
766
767 if (s_MeasureWord(word.begin(),
768 word.end() - match_len) <= min_measure) {
769 return false;
770 }
771
772 word.erase(word.length() - match_len);
773 word += substitute;
774 return true;
775 }
776
777
778 void CDictionaryUtil::Stem(const string& in_str, string* out_str)
779 {
780 *out_str = in_str;
781 string& str = *out_str;
782 //NStr::ToLower(str);
783
784 // step 1a: common 's' endings
785 //
786 // sses -> ss
787 // ies -> i
788 // ss -> ss
789 // s ->
790 if (str[ str.length()-1 ] == 's') {
791 do {
792 if (s_ReplaceEnding(str, "sses", "ss")) {
793 break;
794 }
795
796 if (s_ReplaceEnding(str, "ies", "i")) {
797 break;
798 }
799
800 if ( !s_EndsWith(str, "ss") ) {
801 s_ReplaceEnding(str, "s", "");
802 }
803 }
804 while (false);
805 }
806
807 // step 1b: ed/ing
808 //
809 // eed -> ed (not .eed)
810 // ed -> (*v*)
811 // ing -> (*v*)
812 if (s_EndsWith(str, "eed") && str.length() > 4) {
813 str.erase(str.length() - 1);
814 } else {
815 bool extra = false;
816 if (s_EndsWith(str, "ed") &&
817 s_FindFirstVowel(str) < str.length() - 3) {
818 str.erase(str.length() - 2);
819 extra = true;
820 } else if (s_EndsWith(str, "ing") &&
821 s_FindFirstVowel(str) < str.length() - 3) {
822 str.erase(str.length() - 3);
823 extra = true;
824 }
825
826 if (extra) {
827 if (s_EndsWith(str, "at") ||
828 s_EndsWith(str, "bl") ||
829 s_EndsWith(str, "iz")) {
830 str += 'e';
831 } else if (str[str.length() - 1] != 'l' &&
832 str[str.length() - 1] != 's' &&
833 str[str.length() - 1] != 'z' &&
834 str[str.length() - 1] == str[str.length() - 2]) {
835 str.erase(str.length() - 1);
836 } else if (str.length() == 3 &&
837 s_GetCharType(str[0]) == eConsonant &&
838 s_GetCharType(str[1]) == eVowel &&
839 s_GetCharType(str[2]) == eConsonant) {
840 str += 'e';
841 }
842 }
843 }
844
845 // step 1c: y -> i
846 if (str[str.length() - 1] == 'y' &&
847 s_FindFirstVowel(str) < str.length() - 1) {
848 str[str.length() - 1] = 'i';
849 }
850
851 // step 2
852
853 if (str.length() > 3) {
854 switch (str[ str.length() - 2 ]) {
855 case 'a':
856 if ( !s_ReplaceEnding(str, "ational", "ate") ) {
857 s_ReplaceEnding(str, "tional", "tion");
858 }
859 break;
860
861 case 'c':
862 if ( !s_ReplaceEnding(str, "enci", "ence") ) {
863 s_ReplaceEnding(str, "anci", "ance");
864 }
865 break;
866
867 case 'e':
868 s_ReplaceEnding(str, "izer", "ize");
869 break;
870
871 case 'l':
872 if (str[ str.length()-1 ] == 'i') {
873 if ( !s_ReplaceEnding(str, "abli", "able") ) {
874 if ( !s_ReplaceEnding(str, "alli", "al") ) {
875 if ( !s_ReplaceEnding(str, "entli", "ent") ) {
876 if ( !s_ReplaceEnding(str, "eli", "e") ) {
877 s_ReplaceEnding(str, "ousli", "ous");
878 }
879 }
880 }
881 }
882 }
883 break;
884
885 case 'o':
886 if ( !s_ReplaceEnding(str, "ization", "ize") ) {
887 if ( !s_ReplaceEnding(str, "ation", "ate") ) {
888 s_ReplaceEnding(str, "ator", "ate");
889 }
890 }
891 break;
892
893 case 's':
894 if ( !s_ReplaceEnding(str, "alism", "al") ) {
895 if ( !s_ReplaceEnding(str, "iveness", "ive") ) {
896 if ( !s_ReplaceEnding(str, "fulness", "ful") ) {
897 s_ReplaceEnding(str, "ousness", "ous");
898 }
899 }
900 }
901 break;
902
903 case 't':
904 if ( !s_ReplaceEnding(str, "aliti", "al") ) {
905 if ( !s_ReplaceEnding(str, "iviti", "ive") ) {
906 s_ReplaceEnding(str, "biliti", "ble");
907 }
908 }
909 break;
910
911 default:
912 break;
913 }
914 }
915
916 // 'us' endings
917 //s_ReplaceEnding(str, "u", "us");
918
919 // step 3
920 typedef pair<const char*, const char*> TReplace;
921 static const TReplace rep_step3[] = {
922 TReplace("icate", "ic"),
923 TReplace("ative", ""),
924 TReplace("alize", "al"),
925 TReplace("iciti", "ic"),
926 TReplace("ical", "ic"),
927 TReplace("ful", ""),
928 TReplace("ness", ""),
929
930 /// end
931 TReplace(NULL, NULL)
932 };
933 {{
934 static string s_Step3_Endings("eils");
935 if (s_Step3_Endings.find_first_of(str[ str.length()-1 ]) != string::npos) {
936 for (const TReplace* p = rep_step3; p->first; ++p) {
937 if (s_ReplaceEnding(str, p->first, p->second)) {
938 break;
939 }
940 }
941 }
942 }}
943
944
945 // step 4
946 if (str.length() > 2) {
947 switch (str[ str.length() - 2]) {
948 case 'a':
949 if (str[ str.length()-1 ] == 'l') {
950 if (s_ReplaceEnding(str, "ual", "", 1)) {
951 break;
952 }
953 if (s_ReplaceEnding(str, "ial", "", 1)) {
954 break;
955 }
956 s_ReplaceEnding(str, "al", "", 1);
957 }
958 break;
959
960 case 'c':
961 if (str[ str.length()-1 ] == 'e') {
962 if ( !s_ReplaceEnding(str, "ance", "", 1) ) {
963 s_ReplaceEnding(str, "ence", "", 1);
964 }
965 }
966 break;
967
968 case 'e':
969 s_ReplaceEnding(str, "er", "", 1);
970 break;
971
972 case 'i':
973 if (s_ReplaceEnding(str, "ix", "ic", 0)) {
974 break;
975 }
976 s_ReplaceEnding(str, "ic", "", 1);
977 break;
978
979 case 'l':
980 if ( !s_ReplaceEnding(str, "able", "", 1) ) {
981 s_ReplaceEnding(str, "ible", "", 1);
982 }
983 break;
984
985 case 'n':
986 if ( !s_ReplaceEnding(str, "ant", "", 1) ) {
987 if ( !s_ReplaceEnding(str, "ement", "", 1) ) {
988 if ( !s_ReplaceEnding(str, "ment", "", 1) ) {
989 s_ReplaceEnding(str, "ent", "", 1);
990 }
991 }
992 }
993 break;
994
995 case 'o':
996 if ( !s_ReplaceEnding(str, "sion", "s", 1) ) {
997 if ( !s_ReplaceEnding(str, "tion", "t", 1) ) {
998 s_ReplaceEnding(str, "ou", "", 1);
999 }
1000 }
1001 break;
1002
1003 case 's':
1004 s_ReplaceEnding(str, "ism", "", 1);
1005 break;
1006
1007 case 't':
1008 if ( !s_ReplaceEnding(str, "ate", "", 1) ) {
1009 s_ReplaceEnding(str, "iti", "", 1);
1010 }
1011 break;
1012
1013 case 'u':
1014 s_ReplaceEnding(str, "ous", "", 1);
1015 break;
1016
1017 case 'v':
1018 s_ReplaceEnding(str, "ive", "", 1);
1019 break;
1020
1021 case 'z':
1022 s_ReplaceEnding(str, "ize", "", 1);
1023 break;
1024 }
1025 }
1026
1027 // step 5a
1028 s_ReplaceEnding(str, "e", "", 1);
1029
1030 // step 5b
1031 if (s_MeasureWord(str.begin(), str.end()) > 1 &&
1032 str[str.length() - 1] == 'l' &&
1033 str[str.length() - 2] == 'l') {
1034 str.erase(str.length() - 1);
1035 }
1036
1037 }
1038
1039
1040 END_NCBI_SCOPE
1041 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |