NCBI C++ Toolkit Cross Reference

C++/src/util/strsearch.cpp


  1 /*  $Id: strsearch.cpp 103491 2007-05-04 17:18:18Z kazimird $
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *               National Center for Biotechnology Information
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government have not placed any restriction on its use or reproduction.
 13 *
 14 *  Although all reasonable efforts have been taken to ensure the accuracy
 15 *  and reliability of the software and data, the NLM and the U.S.
 16 *  Government do not and cannot warrant the performance or results that
 17 *  may be obtained by using this software or data. The NLM and the U.S.
 18 *  Government disclaim all warranties, express or implied, including
 19 *  warranties of performance, merchantability or fitness for any particular
 20 *  purpose.
 21 *
 22 *  Please cite the author in any work or product based on this material.
 23 *
 24 * ===========================================================================
 25 *
 26 * Author:  Mati Shomrat
 27 *
 28 * File Description:
 29 *   String search utilities.
 30 *
 31 *   Currently there are two search utilities:
 32 *   1. An implementation of the Boyer-Moore algorithm.
 33 *   2. A finite state automaton.
 34 *
 35 */
 36 
 37 #include <ncbi_pch.hpp>
 38 #include <util/strsearch.hpp>
 39 #include <algorithm>
 40 #include <vector>
 41 
 42 
 43 BEGIN_NCBI_SCOPE
 44 
 45 
 46 //==============================================================================
 47 //                            CBoyerMooreMatcher
 48 //==============================================================================
 49 
 50 // Public:
 51 // =======
 52 
 53 CBoyerMooreMatcher::CBoyerMooreMatcher(const string& pattern, 
 54                                        NStr::ECase   case_sensitive,
 55                                        unsigned int  whole_word)
 56 : m_Pattern(pattern), 
 57   m_PatLen(pattern.length()), 
 58   m_CaseSensitive(case_sensitive), 
 59   m_WholeWord(whole_word),
 60   m_LastOccurrence(sm_AlphabetSize),
 61   m_WordDelimiters(sm_AlphabetSize)
 62 {
 63     x_InitPattern();
 64     // Init the word deimiting alphabet
 65     if (m_WholeWord) {
 66         for (int i = 0; i < sm_AlphabetSize; ++i) {
 67             m_WordDelimiters[i] = (isspace((unsigned char) i) != 0);
 68         }
 69     }
 70 }
 71 
 72 CBoyerMooreMatcher::CBoyerMooreMatcher(const string& pattern,
 73                                        const string& word_delimeters,
 74                                        NStr::ECase   case_sensitive,
 75                                        bool          invert_delimiters)
 76 : m_Pattern(pattern), 
 77   m_PatLen(pattern.length()), 
 78   m_CaseSensitive(case_sensitive), 
 79   m_WholeWord(true),
 80   m_LastOccurrence(sm_AlphabetSize),
 81   m_WordDelimiters(sm_AlphabetSize)
 82 {
 83     x_InitPattern();
 84     SetWordDelimiters(word_delimeters, invert_delimiters);
 85 }
 86 
 87 void CBoyerMooreMatcher::SetWordDelimiters(const string& word_delimeters,
 88                                            bool          invert_delimiters)
 89 {
 90     m_WholeWord = eWholeWordMatch;
 91 
 92     string word_d = word_delimeters;
 93     if (m_CaseSensitive == NStr::eNocase) {
 94         NStr::ToUpper(word_d);
 95     }
 96 
 97     // Init the word delimiting alphabet
 98     for (int i = 0; i < sm_AlphabetSize; ++i) {
 99         char ch = m_CaseSensitive ? i : toupper((unsigned char) i);
100         string::size_type n = word_d.find_first_of(ch);
101         m_WordDelimiters[i] = (!invert_delimiters) == (n != string::npos);
102     }
103 }
104 
105 void CBoyerMooreMatcher::AddDelimiters(const string& word_delimeters)
106 {
107     if (m_WholeWord == 0) {
108         m_WholeWord = eWholeWordMatch;
109     }
110 
111     string word_d = word_delimeters;
112     if (m_CaseSensitive == NStr::eNocase) {
113         NStr::ToUpper(word_d);
114     }
115 
116     for (int i = 0; i < sm_AlphabetSize; ++i) {
117         char ch = m_CaseSensitive ? i : toupper((unsigned char) i);
118         string::size_type n = word_d.find_first_of(ch);
119         if (n != NPOS) {
120             m_WordDelimiters[i] = true;
121         }
122     }
123 }
124 
125 void CBoyerMooreMatcher::AddDelimiters(char ch)
126 {
127     if (m_WholeWord == 0) {
128         m_WholeWord = eWholeWordMatch;
129     }
130     m_WordDelimiters[ch] = true;
131 
132     if (m_CaseSensitive == NStr::eNocase) {
133         ch = toupper((unsigned char) ch);
134     }
135     
136     m_WordDelimiters[ch] = true;
137 }
138 
139 void CBoyerMooreMatcher::InitCommonDelimiters()
140 {
141     if (m_WholeWord == 0) {
142         m_WholeWord = eWholeWordMatch;
143     }
144 
145     for (int i = 0; i < sm_AlphabetSize; ++i) {
146         char ch = m_CaseSensitive ? i : toupper((unsigned char) i);
147         if ((ch >= 'A' && ch <= 'Z') ||
148             (ch >= '0' && ch <= '9') ||
149             (ch == '_')){
150         } else {
151             m_WordDelimiters[i] = true;
152         }
153     }
154 }
155 
156 void CBoyerMooreMatcher::x_InitPattern(void)
157 {
158     if ( m_CaseSensitive == NStr::eNocase) {
159         NStr::ToUpper(m_Pattern);
160     }
161     
162     // For each character in the alpahbet compute its last occurrence in 
163     // the pattern.
164     
165     // Initilalize vector
166     size_t size = m_LastOccurrence.size();
167     for ( size_t i = 0;  i < size;  ++i ) {
168         m_LastOccurrence[i] = m_PatLen;
169     }
170     
171     // compute right-most occurrence
172     for ( int j = 0;  j < (int)m_PatLen - 1;  ++j ) {
173         /* int lo = */
174                 m_LastOccurrence[(int)m_Pattern[j]] = m_PatLen - j - 1;
175    }
176 }
177 
178 
179 SIZE_TYPE CBoyerMooreMatcher::Search(const char*  text, 
180                                      SIZE_TYPE shift,
181                                      SIZE_TYPE text_len) const
182 {
183     // Implementation note.
184     // Case sensitivity check has been taken out of loop. 
185     // Code size for performance optimization. (We generally choose speed).
186     // (Anatoliy)
187     if (m_CaseSensitive == NStr::eCase) {
188         while (shift + m_PatLen <= text_len) {
189             int j = (int)m_PatLen - 1;
190 
191             for(char text_char = text[shift + j];
192                 j >= 0 && m_Pattern[j]==(text_char=text[shift + j]);
193                 --j) {}
194 
195             if ( (j == -1)  &&  IsWholeWord(text, shift, text_len) ) {
196                 return  shift;
197             } else {
198                 shift += (unsigned int)m_LastOccurrence[text[shift + j]];
199             }
200         }
201     } else { // case insensitive NStr::eNocase
202         while (shift + m_PatLen <= text_len) {
203             int j = (int)m_PatLen - 1;
204 
205             for(char text_char = toupper((unsigned char) text[shift + j]);
206                 j >= 0 && m_Pattern[j]==(text_char=toupper((unsigned char) text[shift + j]));
207                 --j) {}
208 
209             if ( (j == -1)  &&  IsWholeWord(text, shift, text_len) ) {
210                 return  shift;
211             } else {
212                 shift += 
213                     (unsigned int)m_LastOccurrence[toupper((unsigned char) text[shift + j])];
214             }
215         }
216     }
217     return (SIZE_TYPE)-1;
218 }
219 
220 
221 // Private:
222 // ========
223 
224 // Constants
225 const int CBoyerMooreMatcher::sm_AlphabetSize = 256;     // assuming ASCII
226 
227 
228 // Member Functions
229 bool CBoyerMooreMatcher::IsWholeWord(const char*  text, 
230                                      SIZE_TYPE    pos,
231                                      SIZE_TYPE    text_len) const
232 {
233     SIZE_TYPE left, right;
234     left = right = 1;
235 
236     // Words at the beginning and end of text are also considered "whole"
237 
238     // check on the left  
239     if (m_WholeWord & ePrefixMatch) {
240         left = (pos == 0) ||
241                ((pos > 0) && m_WordDelimiters[text[pos - 1]]);
242     }
243 
244     // check on the right
245     if (m_WholeWord & eSuffixMatch) {
246         pos += (unsigned int)m_PatLen;
247         right = (pos == text_len) || 
248                 ((pos < text_len) && m_WordDelimiters[text[pos]]);
249     }
250 
251 
252     return (left && right);
253 }
254 
255 
256 END_NCBI_SCOPE
257 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.