|
NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/src/util/strsearch.cpp |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: strsearch.cpp 103491 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat
27 *
28 * File Description:
29 * String search utilities.
30 *
31 * Currently there are two search utilities:
32 * 1. An implementation of the Boyer-Moore algorithm.
33 * 2. A finite state automaton.
34 *
35 */
36
37 #include <ncbi_pch.hpp>
38 #include <util/strsearch.hpp>
39 #include <algorithm>
40 #include <vector>
41
42
43 BEGIN_NCBI_SCOPE
44
45
46 //==============================================================================
47 // CBoyerMooreMatcher
48 //==============================================================================
49
50 // Public:
51 // =======
52
53 CBoyerMooreMatcher::CBoyerMooreMatcher(const string& pattern,
54 NStr::ECase case_sensitive,
55 unsigned int whole_word)
56 : m_Pattern(pattern),
57 m_PatLen(pattern.length()),
58 m_CaseSensitive(case_sensitive),
59 m_WholeWord(whole_word),
60 m_LastOccurrence(sm_AlphabetSize),
61 m_WordDelimiters(sm_AlphabetSize)
62 {
63 x_InitPattern();
64 // Init the word deimiting alphabet
65 if (m_WholeWord) {
66 for (int i = 0; i < sm_AlphabetSize; ++i) {
67 m_WordDelimiters[i] = (isspace((unsigned char) i) != 0);
68 }
69 }
70 }
71
72 CBoyerMooreMatcher::CBoyerMooreMatcher(const string& pattern,
73 const string& word_delimeters,
74 NStr::ECase case_sensitive,
75 bool invert_delimiters)
76 : m_Pattern(pattern),
77 m_PatLen(pattern.length()),
78 m_CaseSensitive(case_sensitive),
79 m_WholeWord(true),
80 m_LastOccurrence(sm_AlphabetSize),
81 m_WordDelimiters(sm_AlphabetSize)
82 {
83 x_InitPattern();
84 SetWordDelimiters(word_delimeters, invert_delimiters);
85 }
86
87 void CBoyerMooreMatcher::SetWordDelimiters(const string& word_delimeters,
88 bool invert_delimiters)
89 {
90 m_WholeWord = eWholeWordMatch;
91
92 string word_d = word_delimeters;
93 if (m_CaseSensitive == NStr::eNocase) {
94 NStr::ToUpper(word_d);
95 }
96
97 // Init the word delimiting alphabet
98 for (int i = 0; i < sm_AlphabetSize; ++i) {
99 char ch = m_CaseSensitive ? i : toupper((unsigned char) i);
100 string::size_type n = word_d.find_first_of(ch);
101 m_WordDelimiters[i] = (!invert_delimiters) == (n != string::npos);
102 }
103 }
104
105 void CBoyerMooreMatcher::AddDelimiters(const string& word_delimeters)
106 {
107 if (m_WholeWord == 0) {
108 m_WholeWord = eWholeWordMatch;
109 }
110
111 string word_d = word_delimeters;
112 if (m_CaseSensitive == NStr::eNocase) {
113 NStr::ToUpper(word_d);
114 }
115
116 for (int i = 0; i < sm_AlphabetSize; ++i) {
117 char ch = m_CaseSensitive ? i : toupper((unsigned char) i);
118 string::size_type n = word_d.find_first_of(ch);
119 if (n != NPOS) {
120 m_WordDelimiters[i] = true;
121 }
122 }
123 }
124
125 void CBoyerMooreMatcher::AddDelimiters(char ch)
126 {
127 if (m_WholeWord == 0) {
128 m_WholeWord = eWholeWordMatch;
129 }
130 m_WordDelimiters[ch] = true;
131
132 if (m_CaseSensitive == NStr::eNocase) {
133 ch = toupper((unsigned char) ch);
134 }
135
136 m_WordDelimiters[ch] = true;
137 }
138
139 void CBoyerMooreMatcher::InitCommonDelimiters()
140 {
141 if (m_WholeWord == 0) {
142 m_WholeWord = eWholeWordMatch;
143 }
144
145 for (int i = 0; i < sm_AlphabetSize; ++i) {
146 char ch = m_CaseSensitive ? i : toupper((unsigned char) i);
147 if ((ch >= 'A' && ch <= 'Z') ||
148 (ch >= '0' && ch <= '9') ||
149 (ch == '_')){
150 } else {
151 m_WordDelimiters[i] = true;
152 }
153 }
154 }
155
156 void CBoyerMooreMatcher::x_InitPattern(void)
157 {
158 if ( m_CaseSensitive == NStr::eNocase) {
159 NStr::ToUpper(m_Pattern);
160 }
161
162 // For each character in the alpahbet compute its last occurrence in
163 // the pattern.
164
165 // Initilalize vector
166 size_t size = m_LastOccurrence.size();
167 for ( size_t i = 0; i < size; ++i ) {
168 m_LastOccurrence[i] = m_PatLen;
169 }
170
171 // compute right-most occurrence
172 for ( int j = 0; j < (int)m_PatLen - 1; ++j ) {
173 /* int lo = */
174 m_LastOccurrence[(int)m_Pattern[j]] = m_PatLen - j - 1;
175 }
176 }
177
178
179 SIZE_TYPE CBoyerMooreMatcher::Search(const char* text,
180 SIZE_TYPE shift,
181 SIZE_TYPE text_len) const
182 {
183 // Implementation note.
184 // Case sensitivity check has been taken out of loop.
185 // Code size for performance optimization. (We generally choose speed).
186 // (Anatoliy)
187 if (m_CaseSensitive == NStr::eCase) {
188 while (shift + m_PatLen <= text_len) {
189 int j = (int)m_PatLen - 1;
190
191 for(char text_char = text[shift + j];
192 j >= 0 && m_Pattern[j]==(text_char=text[shift + j]);
193 --j) {}
194
195 if ( (j == -1) && IsWholeWord(text, shift, text_len) ) {
196 return shift;
197 } else {
198 shift += (unsigned int)m_LastOccurrence[text[shift + j]];
199 }
200 }
201 } else { // case insensitive NStr::eNocase
202 while (shift + m_PatLen <= text_len) {
203 int j = (int)m_PatLen - 1;
204
205 for(char text_char = toupper((unsigned char) text[shift + j]);
206 j >= 0 && m_Pattern[j]==(text_char=toupper((unsigned char) text[shift + j]));
207 --j) {}
208
209 if ( (j == -1) && IsWholeWord(text, shift, text_len) ) {
210 return shift;
211 } else {
212 shift +=
213 (unsigned int)m_LastOccurrence[toupper((unsigned char) text[shift + j])];
214 }
215 }
216 }
217 return (SIZE_TYPE)-1;
218 }
219
220
221 // Private:
222 // ========
223
224 // Constants
225 const int CBoyerMooreMatcher::sm_AlphabetSize = 256; // assuming ASCII
226
227
228 // Member Functions
229 bool CBoyerMooreMatcher::IsWholeWord(const char* text,
230 SIZE_TYPE pos,
231 SIZE_TYPE text_len) const
232 {
233 SIZE_TYPE left, right;
234 left = right = 1;
235
236 // Words at the beginning and end of text are also considered "whole"
237
238 // check on the left
239 if (m_WholeWord & ePrefixMatch) {
240 left = (pos == 0) ||
241 ((pos > 0) && m_WordDelimiters[text[pos - 1]]);
242 }
243
244 // check on the right
245 if (m_WholeWord & eSuffixMatch) {
246 pos += (unsigned int)m_PatLen;
247 right = (pos == text_len) ||
248 ((pos < text_len) && m_WordDelimiters[text[pos]]);
249 }
250
251
252 return (left && right);
253 }
254
255
256 END_NCBI_SCOPE
257 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |