|
NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/src/util/dictionary.cpp |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: dictionary.cpp 103491 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <util/dictionary.hpp>
34 #include <util/dictionary_util.hpp>
35 #include <algorithm>
36
37
38 BEGIN_NCBI_SCOPE
39
40
41
42 CSimpleDictionary::CSimpleDictionary(size_t meta_key_size)
43 : m_MetaphoneKeySize(meta_key_size)
44 {
45 }
46
47
48 CSimpleDictionary::CSimpleDictionary(const string& file,
49 size_t meta_key_size)
50 : m_MetaphoneKeySize(meta_key_size)
51 {
52 CNcbiIfstream istr(file.c_str());
53 Read(istr);
54 }
55
56
57 CSimpleDictionary::CSimpleDictionary(CNcbiIstream& istr,
58 size_t meta_key_size)
59 : m_MetaphoneKeySize(meta_key_size)
60 {
61 Read(istr);
62 }
63
64
65 void CSimpleDictionary::Read(CNcbiIstream& istr)
66 {
67 string line;
68 string metaphone;
69 string word;
70 while (NcbiGetlineEOL(istr, line)) {
71 string::size_type pos = line.find_first_of("|");
72 if (pos == string::npos) {
73 word = line;
74 CDictionaryUtil::GetMetaphone(word, &metaphone, m_MetaphoneKeySize);
75 } else {
76 metaphone = line.substr(0, pos);
77 word = line.substr(pos + 1, line.length() - pos - 1);
78 }
79
80 // insert forward and reverse dictionary pairings
81 m_ForwardDict.insert(m_ForwardDict.end(), word);
82 TStringSet& word_set = m_ReverseDict[metaphone];
83 word_set.insert(word_set.end(), word);
84 }
85 }
86
87 void CSimpleDictionary::Write(CNcbiOstream& ostr) const
88 {
89 ITERATE (TReverseDict, iter, m_ReverseDict) {
90 ITERATE (TStringSet, word_iter, iter->second) {
91 ostr << iter->first << "|" << *word_iter << endl;
92 }
93 }
94 }
95
96
97 void CSimpleDictionary::AddWord(const string& word)
98 {
99 if (word.empty()) {
100 return;
101 }
102
103 // compute the metaphone code
104 string metaphone;
105 CDictionaryUtil::GetMetaphone(word, &metaphone, m_MetaphoneKeySize);
106
107 // insert forward and reverse dictionary pairings
108 m_ForwardDict.insert(word);
109 m_ReverseDict[metaphone].insert(word);
110 }
111
112
113 bool CSimpleDictionary::CheckWord(const string& word) const
114 {
115 TForwardDict::const_iterator iter = m_ForwardDict.find(word);
116 return (iter != m_ForwardDict.end());
117 }
118
119
120 void CSimpleDictionary::SuggestAlternates(const string& word,
121 TAlternates& alternates,
122 size_t max_alts) const
123 {
124 string metaphone;
125 CDictionaryUtil::GetMetaphone(word, &metaphone, m_MetaphoneKeySize);
126 list<TReverseDict::const_iterator> keys;
127 x_GetMetaphoneKeys(metaphone, keys);
128
129 typedef set<SAlternate, SAlternatesByScore> TAltSet;
130 TAltSet words;
131
132 SAlternate alt;
133 size_t count = 0;
134 ITERATE (list<TReverseDict::const_iterator>, key_iter, keys) {
135
136 bool used = false;
137 ITERATE (TStringSet, set_iter, (*key_iter)->second) {
138 // score:
139 // start with edit distance
140 alt.score =
141 CDictionaryUtil::Score(word, metaphone,
142 *set_iter, (*key_iter)->first);
143 if (alt.score <= 0) {
144 continue;
145 }
146
147 _TRACE(" hit: "
148 << metaphone << " <-> " << (*key_iter)->first
149 << ", " << word << " <-> " << *set_iter
150 << " (" << alt.score << ")");
151 used = true;
152 alt.alternate = *set_iter;
153 words.insert(alt);
154 }
155 count += used ? 1 : 0;
156 }
157
158 _TRACE(word << ": "
159 << keys.size() << " keys searched "
160 << count << " keys used");
161
162 if ( !words.empty() ) {
163 TAlternates alts;
164 TAltSet::const_iterator iter = words.begin();
165 alts.push_back(*iter);
166 TAltSet::const_iterator prev = iter;
167 for (++iter;
168 iter != words.end() &&
169 (alts.size() < max_alts || prev->score == iter->score);
170 ++iter) {
171 alts.push_back(*iter);
172 prev = iter;
173 }
174
175 alternates.insert(alternates.end(), alts.begin(), alts.end());
176 }
177 }
178
179
180 void CSimpleDictionary::x_GetMetaphoneKeys(const string& metaphone,
181 list<TReverseDict::const_iterator>& keys) const
182 {
183 if ( !metaphone.length() ) {
184 return;
185 }
186
187 const size_t max_meta_edit_dist = 1;
188 const CDictionaryUtil::EDistanceMethod method =
189 CDictionaryUtil::eEditDistance_Similar;
190
191 string::const_iterator iter = metaphone.begin();
192 string::const_iterator end = iter + max_meta_edit_dist + 1;
193
194 size_t count = 0;
195 _TRACE("meta key: " << metaphone);
196 for ( ; iter != end; ++iter) {
197 string seed(1, *iter);
198 _TRACE(" meta key seed: " << seed);
199 TReverseDict::const_iterator lower = m_ReverseDict.lower_bound(seed);
200 for ( ;
201 lower != m_ReverseDict.end() && lower->first[0] == *iter;
202 ++lower, ++count) {
203
204 size_t dist =
205 CDictionaryUtil::GetEditDistance(lower->first, metaphone,
206 method);
207 if (dist > max_meta_edit_dist) {
208 continue;
209 }
210
211 keys.push_back(lower);
212 }
213 }
214
215 _TRACE("exmained " << count << " keys, returning " << keys.size());
216 }
217
218
219 /////////////////////////////////////////////////////////////////////////////
220 ///
221 /// CMultiDictionary
222 ///
223
224 struct SDictByPriority {
225 bool operator()(const CMultiDictionary::SDictionary& d1,
226 const CMultiDictionary::SDictionary& d2) const
227 {
228 return (d1.priority < d2.priority);
229 }
230 };
231
232 void CMultiDictionary::RegisterDictionary(IDictionary& dict, int priority)
233 {
234 SDictionary d;
235 d.dict.Reset(&dict);
236 d.priority = priority;
237
238 m_Dictionaries.push_back(d);
239 std::sort(m_Dictionaries.begin(), m_Dictionaries.end(), SDictByPriority());
240 }
241
242
243 bool CMultiDictionary::CheckWord(const string& word) const
244 {
245 ITERATE (TDictionaries, iter, m_Dictionaries) {
246 if ( iter->dict->CheckWord(word) ) {
247 return true;
248 }
249 }
250
251 return false;
252 }
253
254
255 void CMultiDictionary::SuggestAlternates(const string& word,
256 TAlternates& alternates,
257 size_t max_alts) const
258 {
259 TAlternates alts;
260 ITERATE (TDictionaries, iter, m_Dictionaries) {
261 iter->dict->SuggestAlternates(word, alts, max_alts);
262 }
263
264 std::sort(alts.begin(), alts.end(), SAlternatesByScore());
265 if (alts.size() > max_alts) {
266 TAlternates::iterator prev = alts.begin() + max_alts;
267 TAlternates::iterator iter = prev;
268 ++iter;
269 for ( ; iter != alts.end() && iter->score == prev->score; ++iter) {
270 prev = iter;
271 }
272 alts.erase(iter, alts.end());
273 }
274
275 alternates.swap(alts);
276 }
277
278
279 CCachedDictionary::CCachedDictionary(IDictionary& dict)
280 : m_Dict(&dict)
281 {
282 }
283
284
285 bool CCachedDictionary::CheckWord(const string& word) const
286 {
287 return m_Dict->CheckWord(word);
288 }
289
290
291 void CCachedDictionary::SuggestAlternates(const string& word,
292 TAlternates& alternates,
293 size_t max_alts) const
294 {
295 TAltCache::iterator iter = m_Misses.find(word);
296 if (iter != m_Misses.end()) {
297 alternates = iter->second;
298 return;
299 }
300
301 m_Dict->SuggestAlternates(word, m_Misses[word], max_alts);
302 alternates = m_Misses[word];
303 }
304
305
306 END_NCBI_SCOPE
307 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |