NCBI C++ Toolkit Cross Reference

C++/src/util/dictionary.cpp


  1 /*  $Id: dictionary.cpp 103491 2007-05-04 17:18:18Z kazimird $
  2  * ===========================================================================
  3  *
  4  *                            PUBLIC DOMAIN NOTICE
  5  *               National Center for Biotechnology Information
  6  *
  7  *  This software/database is a "United States Government Work" under the
  8  *  terms of the United States Copyright Act.  It was written as part of
  9  *  the author's official duties as a United States Government employee and
 10  *  thus cannot be copyrighted.  This software/database is freely available
 11  *  to the public for use. The National Library of Medicine and the U.S.
 12  *  Government have not placed any restriction on its use or reproduction.
 13  *
 14  *  Although all reasonable efforts have been taken to ensure the accuracy
 15  *  and reliability of the software and data, the NLM and the U.S.
 16  *  Government do not and cannot warrant the performance or results that
 17  *  may be obtained by using this software or data. The NLM and the U.S.
 18  *  Government disclaim all warranties, express or implied, including
 19  *  warranties of performance, merchantability or fitness for any particular
 20  *  purpose.
 21  *
 22  *  Please cite the author in any work or product based on this material.
 23  *
 24  * ===========================================================================
 25  *
 26  * Authors:  Mike DiCuccio
 27  *
 28  * File Description:
 29  *
 30  */
 31 
 32 #include <ncbi_pch.hpp>
 33 #include <util/dictionary.hpp>
 34 #include <util/dictionary_util.hpp>
 35 #include <algorithm>
 36 
 37 
 38 BEGIN_NCBI_SCOPE
 39 
 40 
 41 
 42 CSimpleDictionary::CSimpleDictionary(size_t meta_key_size)
 43     : m_MetaphoneKeySize(meta_key_size)
 44 {
 45 }
 46 
 47 
 48 CSimpleDictionary::CSimpleDictionary(const string& file,
 49                                      size_t meta_key_size)
 50     : m_MetaphoneKeySize(meta_key_size)
 51 {
 52     CNcbiIfstream istr(file.c_str());
 53     Read(istr);
 54 }
 55 
 56 
 57 CSimpleDictionary::CSimpleDictionary(CNcbiIstream& istr,
 58                                      size_t meta_key_size)
 59     : m_MetaphoneKeySize(meta_key_size)
 60 {
 61     Read(istr);
 62 }
 63 
 64 
 65 void CSimpleDictionary::Read(CNcbiIstream& istr)
 66 {
 67     string line;
 68     string metaphone;
 69     string word;
 70     while (NcbiGetlineEOL(istr, line)) {
 71         string::size_type pos = line.find_first_of("|");
 72         if (pos == string::npos) {
 73             word = line;
 74             CDictionaryUtil::GetMetaphone(word, &metaphone, m_MetaphoneKeySize);
 75         } else {
 76             metaphone = line.substr(0, pos);
 77             word = line.substr(pos + 1, line.length() - pos - 1);
 78         }
 79 
 80         // insert forward and reverse dictionary pairings
 81         m_ForwardDict.insert(m_ForwardDict.end(), word);
 82         TStringSet& word_set = m_ReverseDict[metaphone];
 83         word_set.insert(word_set.end(), word);
 84     }
 85 }
 86 
 87 void CSimpleDictionary::Write(CNcbiOstream& ostr) const
 88 {
 89     ITERATE (TReverseDict, iter, m_ReverseDict) {
 90         ITERATE (TStringSet, word_iter, iter->second) {
 91             ostr << iter->first << "|" << *word_iter << endl;
 92         }
 93     }
 94 }
 95 
 96 
 97 void CSimpleDictionary::AddWord(const string& word)
 98 {
 99     if (word.empty()) {
100         return;
101     }
102 
103     // compute the metaphone code
104     string metaphone;
105     CDictionaryUtil::GetMetaphone(word, &metaphone, m_MetaphoneKeySize);
106 
107     // insert forward and reverse dictionary pairings
108     m_ForwardDict.insert(word);
109     m_ReverseDict[metaphone].insert(word);
110 }
111 
112 
113 bool CSimpleDictionary::CheckWord(const string& word) const
114 {
115     TForwardDict::const_iterator iter = m_ForwardDict.find(word);
116     return (iter != m_ForwardDict.end());
117 }
118 
119 
120 void CSimpleDictionary::SuggestAlternates(const string& word,
121                                           TAlternates& alternates,
122                                           size_t max_alts) const
123 {
124     string metaphone;
125     CDictionaryUtil::GetMetaphone(word, &metaphone, m_MetaphoneKeySize);
126     list<TReverseDict::const_iterator> keys;
127     x_GetMetaphoneKeys(metaphone, keys);
128 
129     typedef set<SAlternate, SAlternatesByScore> TAltSet;
130     TAltSet words;
131 
132     SAlternate alt;
133     size_t count = 0;
134     ITERATE (list<TReverseDict::const_iterator>, key_iter, keys) {
135 
136         bool used = false;
137         ITERATE (TStringSet, set_iter, (*key_iter)->second) {
138             // score:
139             // start with edit distance
140             alt.score = 
141                 CDictionaryUtil::Score(word, metaphone,
142                                        *set_iter, (*key_iter)->first);
143             if (alt.score <= 0) {
144                 continue;
145             }
146 
147             _TRACE("  hit: "
148                    << metaphone << " <-> " << (*key_iter)->first
149                    << ", " << word << " <-> " << *set_iter
150                    << " (" << alt.score << ")");
151             used = true;
152             alt.alternate = *set_iter;
153             words.insert(alt);
154         }
155         count += used ? 1 : 0;
156     }
157 
158     _TRACE(word << ": "
159            << keys.size() << " keys searched "
160            << count << " keys used");
161 
162     if ( !words.empty() ) {
163         TAlternates alts;
164         TAltSet::const_iterator iter = words.begin();
165         alts.push_back(*iter);
166         TAltSet::const_iterator prev = iter;
167         for (++iter;
168              iter != words.end()  &&
169              (alts.size() < max_alts  ||  prev->score == iter->score);
170              ++iter) {
171             alts.push_back(*iter);
172             prev = iter;
173         }
174 
175         alternates.insert(alternates.end(), alts.begin(), alts.end());
176     }
177 }
178 
179 
180 void CSimpleDictionary::x_GetMetaphoneKeys(const string& metaphone,
181                                            list<TReverseDict::const_iterator>& keys) const
182 {
183     if ( !metaphone.length() ) {
184         return;
185     }
186 
187     const size_t max_meta_edit_dist = 1;
188     const CDictionaryUtil::EDistanceMethod method =
189         CDictionaryUtil::eEditDistance_Similar;
190 
191     string::const_iterator iter = metaphone.begin();
192     string::const_iterator end  = iter + max_meta_edit_dist + 1;
193 
194     size_t count = 0;
195     _TRACE("meta key: " << metaphone);
196     for ( ;  iter != end;  ++iter) {
197         string seed(1, *iter);
198         _TRACE("  meta key seed: " << seed);
199         TReverseDict::const_iterator lower = m_ReverseDict.lower_bound(seed);
200         for ( ;
201               lower != m_ReverseDict.end()  &&  lower->first[0] == *iter;
202               ++lower, ++count) {
203 
204             size_t dist =
205                 CDictionaryUtil::GetEditDistance(lower->first, metaphone,
206                                                  method);
207             if (dist > max_meta_edit_dist) {
208                 continue;
209             }
210 
211             keys.push_back(lower);
212         }
213     }
214 
215     _TRACE("exmained " << count << " keys, returning " << keys.size());
216 }
217 
218 
219 /////////////////////////////////////////////////////////////////////////////
220 ///
221 /// CMultiDictionary
222 ///
223 
224 struct SDictByPriority {
225     bool operator()(const CMultiDictionary::SDictionary& d1,
226                     const CMultiDictionary::SDictionary& d2) const
227     {
228         return (d1.priority < d2.priority);
229     }
230 };
231 
232 void CMultiDictionary::RegisterDictionary(IDictionary& dict, int priority)
233 {
234     SDictionary d;
235     d.dict.Reset(&dict);
236     d.priority = priority;
237 
238     m_Dictionaries.push_back(d);
239     std::sort(m_Dictionaries.begin(), m_Dictionaries.end(), SDictByPriority());
240 }
241 
242 
243 bool CMultiDictionary::CheckWord(const string& word) const
244 {
245     ITERATE (TDictionaries, iter, m_Dictionaries) {
246         if ( iter->dict->CheckWord(word) ) {
247             return true;
248         }
249     }
250 
251     return false;
252 }
253 
254 
255 void CMultiDictionary::SuggestAlternates(const string& word,
256                                          TAlternates& alternates,
257                                          size_t max_alts) const
258 {
259     TAlternates alts;
260     ITERATE (TDictionaries, iter, m_Dictionaries) {
261         iter->dict->SuggestAlternates(word, alts, max_alts);
262     }
263 
264     std::sort(alts.begin(), alts.end(), SAlternatesByScore());
265     if (alts.size() > max_alts) {
266         TAlternates::iterator prev = alts.begin() + max_alts;
267         TAlternates::iterator iter = prev;
268         ++iter;
269         for ( ;  iter != alts.end()  && iter->score == prev->score;  ++iter) {
270             prev = iter;
271         }
272         alts.erase(iter, alts.end());
273     }
274 
275     alternates.swap(alts);
276 }
277 
278 
279 CCachedDictionary::CCachedDictionary(IDictionary& dict)
280     : m_Dict(&dict)
281 {
282 }
283 
284 
285 bool CCachedDictionary::CheckWord(const string& word) const
286 {
287     return m_Dict->CheckWord(word);
288 }
289 
290 
291 void CCachedDictionary::SuggestAlternates(const string& word,
292                                           TAlternates& alternates,
293                                           size_t max_alts) const
294 {
295     TAltCache::iterator iter = m_Misses.find(word);
296     if (iter != m_Misses.end()) {
297         alternates = iter->second;
298         return;
299     }
300 
301     m_Dict->SuggestAlternates(word, m_Misses[word], max_alts);
302     alternates = m_Misses[word];
303 }
304 
305 
306 END_NCBI_SCOPE
307 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.