#ifndef UTIL___DIFF__HPP #define UTIL___DIFF__HPP /* $Id: diff.hpp 84743 2018-12-06 13:05:45Z ivanov $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Authors: Vladimir Ivanov * */ /// @file diff.hpp /// /// API to diff strings/texts and to find the differences. /// /// CDiff - Find the differences between two strings (character mode). /// CDiffText - Find the differences between two texts (line mode). /// CDiffList - The list of differences (diff operations). /// CDiffOperation - The storage class for one diff operation. /// CDiffException - Exceptions generated by Diff API. /// /// Based on algorithms from the Diff, Match and Patch Library: /// http://code.google.com/p/google-diff-match-patch/ /// Also, uses Diff Template Library: /// http://code.google.com/p/dtl-cpp/ /// #include <corelib/ncbistd.hpp> #include <corelib/ncbitime.hpp> /** @addtogroup Diff * * @{ */ BEGIN_NCBI_SCOPE ///////////////////////////////////////////////////////////////////////////// /// /// CDiffOperation -- The storage class for one diff operation. /// class NCBI_XUTIL_EXPORT CDiffOperation { public: // Type definitions typedef size_t size_type; /// Type of the current diff operation. enum EType { eDelete, ///< String that exists in old text, and deleted in new one eEqual, ///< String exists in both texts eInsert ///< String that exists in new text, and deleted in old one }; /// Structure to save offset/length in the compared strings struct SPos { size_type first; ///< Position in first (original) string size_type second; ///< Position in second (result) string SPos(size_type p1 = NPOS, size_type p2 = NPOS) : first(p1), second(p2) {} }; /// Constructor /// /// @param operation /// Type of the current diff operation. /// @param str /// The string associated with this diff operation. CDiffOperation(EType operation, CTempString str); /// Get type of operation EType GetOperation() const { return m_Operation; } /// Check type of operation bool IsInsert(void) const { return m_Operation == eInsert; } bool IsDelete(void) const { return m_Operation == eDelete; } bool IsEqual (void) const { return m_Operation == eEqual; } /// Get string for current operation CTempString GetString(void) const { return m_String; } /// Get string length. /// /// For character-based diffs it is the same as GetString().length(). /// But in the line-based diffs the string can be truncated /// and it may not have ending EOL. So, the original string can be /// reconstructed using: /// CTempString(op.GetString().data(), op.GetLength()) /// size_type GetLength(void) const { return m_Length; } /// Get offset of the current difference in the original text. /// /// Return structure with offsets in the first (original) /// and in the second (result) strings. /// /// For performance reasons, Diff() does not operate with offsets of /// the substrings and it does not store them. Original strings /// can be easily reconstructed from the list of diffs, and offsets /// also can be calculated afterwards. /// See CDiffList::CalculateOffsets(). /// @return /// - Zero-based offset of the current difference in the original /// text. The offset is counting from the start of the text for //// both block- and line-based modes. /// - NPOS if offset is not calculated or unknown. /// - GetOffset().first represents offsets for DELETE and EQUAL /// operation only, GetOffset().second for INSERT and EQUAL only. /// @sa /// GetOperation(), GetString(), GetLine(), /// CDiff::Diff(), CDiffList::CalculateOffsets() SPos GetOffset(void) const; /// Get line number of the current difference (line-mode diff only). /// Line numbers are 1-based. /// /// @return /// - 1-based line number of the current difference in line-mode /// only. Always return NPOS in block-mode. /// - GetLine().first returns line number for DELETE and EQUAL /// operations only, GetLine().second for INSERT and EQUAL only. /// - NPOS otherwise. /// @sa /// GetOperation(), GetString(), GetOffset(), CDiff::DiffLines() SPos GetLine(void) const; /// Compare substrings and operation types only, all other attributes /// (offset, row, col and etc) are not used in comparison. bool operator== (const CDiffOperation& diff_op) const; bool operator!= (const CDiffOperation& diff_op) const; protected: // Setters void SetOperation(EType operation); void SetString(CTempString str, size_type len = NPOS); void SetLength(size_type length); void SetOffset(SPos offset); void SetLine(SPos line); friend class CDiffList; friend class CDiff; friend class CDiffText; private: EType m_Operation; ///< Type of the current diff operation CTempString m_String; ///< The string associated with this diff operation size_type m_Length; ///< The real length of the m_String ///< (including EOL, if it was truncated in line-mode diff) SPos m_Offset; ///< The offsets of the current string in the original string (or NPOS) SPos m_Line; ///< Line numbers of the current diff (or NPOS) -- line-mode only }; ///////////////////////////////////////////////////////////////////////////// /// /// CDiffList -- the list of diff operations. /// class NCBI_XUTIL_EXPORT CDiffList { public: /// Size type definition typedef CDiffOperation::size_type size_type; /// Storage class type for the list of diff operations typedef list<CDiffOperation> TList; /// Constructor. CDiffList(void) {} /// Compute the edit distance (Levenshtein distance). /// /// Can be used to check how much strings differ. /// @return /// Return number of single-character edits in the current difference /// list, including the number of inserted, deleted or substituted /// characters, required to be done to convert one string to another. /// Return zero if diffed strings are equal. size_type GetEditDistance(void) const; /// Find the longest common substring for current difference list. /// /// @return /// The algorithm just returns the first longest common substring /// it finds, which therefore may not be unique. /// Return empty string if compared strings where fully different /// and no common substrings where found. /// @note /// If current difference list is not optimal, the result can /// be shorter than it should be. Use CleanupAndMerge() first /// to get more optimized diff. /// @note /// For not optimized line-based diffs it return a first longest /// common line. CTempString GetLongestCommonSubstring(void) const; /// Reorder and merge like edit sections, merge equalities. /// /// This method can make resulting list cleaner and shorter, /// but this can take some time. void CleanupAndMerge(void); /// Calculate offsets for all substrings in the difference list /// and find its position from the start of the original strings, /// used in diff. /// /// For performance reasons, Diff() do not operate with offsets of /// the substrings and do not store them, but as original strings /// can be easy reconstructed from the list of diffs, that offsets /// also can be calculated afterwards. /// /// For DELETE or EQUAl type operations the calculated value is /// an offset of the substring in the first (original) string /// used in diff. For INSERT type operation the calculated value /// is an offset in the second (result) string used in diff. /// /// @note /// This method cannot calculate rows and columns for regular /// diffs, use line-mode diffs that do this automatically. /// @sa /// CDiffOperation::GetOffset() void CalculateOffsets(void); public: /// Get list of the diff operations as list<>. const TList& GetList(void) const; /// Remove all elements from the list void Clear(void); /// Add element to the front of the list. void Prepend(CDiffOperation::EType operation, CTempString str); /// Add element to the front of the list void Prepend(const CDiffOperation& op); /// Add element to the end of the list. void Append(CDiffOperation::EType operation, CTempString str); /// Add element to the end of the list. void Append(const CDiffOperation& op); private: /// Merge adjacent parts with the same operation. /// Any edit section can move as long as it doesn't cross an equality. void x_CleanupAndMerge_Equities(void); /// Look for single edits surrounded on both sides by equalities which /// can be shifted sideways to eliminate an equality. /// e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC /// Return TRUE if shifts were made and the diff needs reordering /// and another shift sweep bool x_CleanupAndMerge_SingleEdits(void); private: TList m_List; ///< List of the differences friend class CDiff; friend class CDiffText; }; ///////////////////////////////////////////////////////////////////////////// /// /// CDiffBase -- Diff and compare texts (base class) /// /// Do not use this class directly, use inherited classes CDiff or CDiffText. class NCBI_XUTIL_EXPORT CDiffBase { public: /// Type definition. typedef CDiffOperation::size_type size_type; /// Set timeout. /// /// Despite the large number of optimizations used in Diff(), it can take /// a while to compute. You can limit its execution time using this method. /// The default value for timeout if not specified is infinity, that lets /// diff run until completion. Should diff timeout, the return value will /// still be a valid difference, though probably non-optimal. void SetTimeout(const CTimeout& tmo) { m_Timeout = tmo; } /// Check if timeout is expired. bool IsTimeoutExpired() const; private: /// Constructor CDiffBase(void) : m_Timeout(CTimeout::eInfinite), m_Deadline(NULL) { } protected: /// Reset internal state and prepare to next Diff() void Reset(void); protected: CDiffList m_Diffs; ///< The list of differences from the last diff CTimeout m_Timeout; ///< Relative timeout for processing CDeadline* m_Deadline; ///< Deadline for processing (NULL if not set) friend class CDiff; friend class CDiffText; }; ///////////////////////////////////////////////////////////////////////////// /// /// CDiff -- Diff and compare strings (character mode) /// /// Throw exception on error. class NCBI_XUTIL_EXPORT CDiff : public CDiffBase { public: /// Processing flags. enum EFlags { /// By default Diff() automatically call CleanupAndMerge() for the /// generated list of differences to have a shorter and cleaner list /// of results. It is on by default for regular character-based diff. /// But if you need faster (and less optimal) result, you can skip it. fNoCleanup = 1 << 0, /// Automatically call CalculateOffests() for the generated list /// of differences. fCalculateOffsets = 1 << 1 }; typedef unsigned int TFlags; ///< Bitwise OR of "EFlags" /// Find the differences between two texts (character mode). /// /// @param s1 /// Old string to be diffed. /// @param s2 /// New string to be diffed. /// @flags /// Processing flags. /// @return /// The list of operations that should be done to convert string 's1' to 's2'. /// @sa SetTimeout(), CDiffText CDiffList& Diff(CTempString s1, CTempString s2, TFlags flags = 0); private: /// Five element array for the list of strings, returned by x_DiffHalfMatch() typedef vector<CTempString> TDiffHalfMatchList; // Auxiliary methods for Diff() bool x_DiffHalfMatch (CTempString s1, CTempString s2, TDiffHalfMatchList& hm) const; bool x_DiffHalfMatchI (CTempString long_str, CTempString short_str, size_type i, TDiffHalfMatchList& hm) const; void x_DiffBisect (CTempString s1, CTempString s2, CDiffList& diffs) const; void x_DiffBisectSplit(CTempString s1, CTempString s2, int x, int y, CDiffList& diffs) const; void x_Diff(CTempString s1, CTempString s2, CDiffList& diffs) const; }; ///////////////////////////////////////////////////////////////////////////// /// /// CDiffText -- Diff and compare texts (line mode) /// /// Throw exception on error. class NCBI_XUTIL_EXPORT CDiffText : public CDiffBase { public: /// Processing flags. enum EFlags { /// Automatically call CleanupAndMerge() for the generated list /// of differences after Diff() to have a shorter and cleaner /// list of results. It is off by default for line-based diffs. fCleanup = 1 << 0, /// Automatically call CalculateOffests() for the generated list /// of differences. fCalculateOffsets = 1 << 1, /// Ignore differences in end-of-line types on string comparison. fIgnoreEOL = 1 << 2, /// Remove end-of-line symbols from each string added to the list, /// which can be obtained using CDiffOperation::GetString(). /// This flag is incompatible with fCleanup, exception will be /// thrown if they where used together. The fRemoveEOL affect /// a visual appearance of the string, and do not have an influence /// on how lines with different kind of EOLs compares, /// use fIgnoreEOL in addition if necessary. fRemoveEOL = 1 << 3 }; typedef unsigned int TFlags; ///< Bitwise OR of "Flags" /// Find the differences between two texts (line mode). /// /// Run a line-based diff, that operates with whole lines, and not trying /// to find a differences inside added/deleted blocks of lines. /// @param text1 /// Old text to be diffed. /// @param text2 /// New text to be diffed. /// @flags /// Processing flags. /// Default: non-optimal line-based diff that have all EOLs. /// @return /// The list of operations that should be done to convert text 's1' to 's2'. /// @sa SetTimeout(), CDiff CDiffList& Diff(CTempString text1, CTempString text2, TFlags flags = 0); /// Find the differences between two texts and print result /// into output stream in unified-like format. /// /// @param out /// The output stream. /// @param text1 /// Old text to be diffed. /// @param text2 /// New text to be diffed. /// @param num_common_lines /// The number of unchanged lines shown above and below a change hunk. /// Three lines is typically the default. Use zero, if you need to see /// a changes only, without any surrounding text. /// @sa /// Diff() /// @note /// Using CTempStrings internally is faster and use less memory, but do not /// allow to perform some optimizations and cleanup, so result can looks /// different that 'diff' utility provide, and have more hunks. /// Also, the printed result have next differences from canonical unified /// format: /// 1) Unified format allow crossed hunks, when end of the first and start /// of the second hunk have the same unchanged lines of text. This version /// don't have crossed hunks. If some hunks crosses, the second will be /// started with less number of unchanged lines that specified by /// 'num_common_lines' parameter. /// 2) Unified format can have zero-based line numbers. like: /// @@ -0,0 +1,1 @@ /// +Added line. /// This version always use one-based line numbers: /// @@ -1,0 +1,1 @@ /// +Added line. /// @note /// It is not fully compatible with result provided by 'diff -u' (or -U NNN), /// and sometimes it is impossible to use 'patch' utility with provided diff. CNcbiOstream& PrintUnifiedDiff(CNcbiOstream& out, CTempString text1, CTempString text2, unsigned int num_common_lines = 3); }; ///////////////////////////////////////////////////////////////////////////// /// /// CDiffException -- /// /// Define exceptions generated by CDiff. class NCBI_XUTIL_EXPORT CDiffException : public CException { public: enum EErrCode { eEmpty, eBadFlags }; virtual const char* GetErrCodeString(void) const override; NCBI_EXCEPTION_DEFAULT(CDiffException, CException); }; /* @} */ ////////////////////////////////////////////////////////////////////////////// // // Inline // // // CDiffOperation // inline bool CDiffOperation::operator== (const CDiffOperation& diff_op) const { return (diff_op.m_Operation == this->m_Operation) && (diff_op.m_String == this->m_String); } inline bool CDiffOperation::operator!= (const CDiffOperation& diff_op) const { return !(operator == (diff_op)); } inline CDiffOperation::SPos CDiffOperation::GetOffset(void) const { return m_Offset; } inline CDiffOperation::SPos CDiffOperation::GetLine(void) const { return m_Line; } inline void CDiffOperation::SetOperation(EType operation) { m_Operation = operation; } inline void CDiffOperation::SetLength(size_type length) { m_Length = length; } inline void CDiffOperation::SetString(CTempString str, size_type len) { m_String = str; m_Length = (len == NPOS) ? str.length() : len; } inline void CDiffOperation::SetOffset(SPos offset) { m_Offset = offset; } inline void CDiffOperation::SetLine(SPos line) { m_Line = line; } // // CDiffList // inline const CDiffList::TList& CDiffList::GetList(void) const { return m_List; } inline void CDiffList::Prepend(CDiffOperation::EType operation, CTempString str) { if (str.length()) { m_List.push_front(CDiffOperation(operation, str)); } } inline void CDiffList::Prepend(const CDiffOperation& op) { m_List.push_front(op); } inline void CDiffList::Append(CDiffOperation::EType operation, CTempString str) { if (str.length()) { m_List.push_back(CDiffOperation(operation, str)); } } inline void CDiffList::Append(const CDiffOperation& op) { m_List.push_back(op); } inline void CDiffList::Clear(void) { m_List.clear(); } // // CDiffBase // inline void CDiffBase::Reset(void) { m_Diffs.Clear(); m_Deadline = NULL; } inline bool CDiffBase::IsTimeoutExpired() const { if (!m_Deadline) { return false; } return m_Deadline->IsExpired(); } END_NCBI_SCOPE #endif /* UTIL___DIFF__HPP */
0001 0002 0003 0004 0005 0006 0007 0008 0009 0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 0100 0101 0102 0103 0104 0105 0106 0107 0108 0109 0110 0111 0112 0113 0114 0115 0116 0117 0118 0119 0120 0121 0122 0123 0124 0125 0126 0127 0128 0129 0130 0131 0132 0133 0134 0135 0136 0137 0138 0139 0140 0141 0142 0143 0144 0145 0146 0147 0148 0149 0150 0151 0152 0153 0154 0155 0156 0157 0158 0159 0160 0161 0162 0163 0164 0165 0166 0167 0168 0169 0170 0171 0172 0173 0174 0175 0176 0177 0178 0179 0180 0181 0182 0183 0184 0185 0186 0187 0188 0189 0190 0191 0192 0193 0194 0195 0196 0197 0198 0199 0200 0201 0202 0203 0204 0205 0206 0207 0208 0209 0210 0211 0212 0213 0214 0215 0216 0217 0218 0219 0220 0221 0222 0223 0224 0225 0226 0227 0228 0229 0230 0231 0232 0233 0234 0235 0236 0237 0238 0239 0240 0241 0242 0243 0244 0245 0246 0247 0248 0249 0250 0251 0252 0253 0254 0255 0256 0257 0258 0259 0260 0261 0262 0263 0264 0265 0266 0267 0268 0269 0270 0271 0272 0273 0274 0275 0276 0277 0278 0279 0280 0281 0282 0283 0284 0285 0286 0287 0288 0289 0290 0291 0292 0293 0294 0295 0296 0297 0298 0299 0300 0301 0302 0303 0304 0305 0306 0307 0308 0309 0310 0311 0312 0313 0314 0315 0316 0317 0318 0319 0320 0321 0322 0323 0324 0325 0326 0327 0328 0329 0330 0331 0332 0333 0334 0335 0336 0337 0338 0339 0340 0341 0342 0343 0344 0345 0346 0347 0348 0349 0350 0351 0352 0353 0354 0355 0356 0357 0358 0359 0360 0361 0362 0363 0364 0365 0366 0367 0368 0369 0370 0371 0372 0373 0374 0375 0376 0377 0378 0379 0380 0381 0382 0383 0384 0385 0386 0387 0388 0389 0390 0391 0392 0393 0394 0395 0396 0397 0398 0399 0400 0401 0402 0403 0404 0405 0406 0407 0408 0409 0410 0411 0412 0413 0414 0415 0416 0417 0418 0419 0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 0430 0431 0432 0433 0434 0435 0436 0437 0438 0439 0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 0450 0451 0452 0453 0454 0455 0456 0457 0458 0459 0460 0461 0462 0463 0464 0465 0466 0467 0468 0469 0470 0471 0472 0473 0474 0475 0476 0477 0478 0479 0480 0481 0482 0483 0484 0485 0486 0487 0488 0489 0490 0491 0492 0493 0494 0495 0496 0497 0498 0499 0500 0501 0502 0503 0504 0505 0506 0507 0508 0509 0510 0511 0512 0513 0514 0515 0516 0517 0518 0519 0520 0521 0522 0523 0524 0525 0526 0527 0528 0529 0530 0531 0532 0533 0534 0535 0536 0537 0538 0539 0540 0541 0542 0543 0544 0545 0546 0547 0548 0549 0550 0551 0552 0553 0554 0555 0556 0557 0558 0559 0560 0561 0562 0563 0564 0565 0566 0567 0568 0569 0570 0571 0572 0573 0574 0575 0576 0577 0578 0579 0580 0581 0582 0583 0584 0585 0586 0587 0588 0589 0590 0591 0592 0593 0594 0595 0596 0597 0598 0599 0600 0601 0602 0603 0604 0605 0606 0607 0608 0609 0610 0611 0612 0613 0614 0615 0616 0617 0618 0619 0620 0621