#ifndef UTIL___DIFF__HPP
#define UTIL___DIFF__HPP

/*  $Id: diff.hpp 84743 2018-12-06 13:05:45Z ivanov $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Authors:  Vladimir Ivanov
 *
 */

/// @file diff.hpp
///
/// API to diff strings/texts and to find the differences.
///
/// CDiff          - Find the differences between two strings (character mode).
/// CDiffText      - Find the differences between two texts (line mode).
/// CDiffList      - The list of differences (diff operations).
/// CDiffOperation - The storage class for one diff operation.
/// CDiffException - Exceptions generated by Diff API.
///
/// Based on algorithms from the Diff, Match and Patch Library:
///     http://code.google.com/p/google-diff-match-patch/
/// Also, uses Diff Template Library:
///     http://code.google.com/p/dtl-cpp/
///

#include <corelib/ncbistd.hpp>
#include <corelib/ncbitime.hpp>

/** @addtogroup Diff
 *
 * @{
 */

BEGIN_NCBI_SCOPE


/////////////////////////////////////////////////////////////////////////////
///
/// CDiffOperation -- The storage class for one diff operation.
///

class NCBI_XUTIL_EXPORT CDiffOperation
{
public:
    // Type definitions
    typedef size_t size_type;

    /// Type of the current diff operation.
    enum EType {
        eDelete,  ///< String that exists in old text, and deleted in new one
        eEqual,   ///< String exists in both texts
        eInsert   ///< String that exists in new text, and deleted in old one
    };

    /// Structure to save offset/length in the compared strings
    struct SPos {
        size_type first;  ///< Position in first (original) string
        size_type second; ///< Position in second (result) string
        SPos(size_type p1 = NPOS, size_type p2 = NPOS)
            : first(p1), second(p2) {}
    };

    /// Constructor
    ///
    /// @param operation
    ///   Type of the current diff operation.
    /// @param str
    ///   The string associated with this diff operation.
    CDiffOperation(EType operation, CTempString str);

    /// Get type of operation
    EType GetOperation() const { return m_Operation; }

    /// Check type of operation
    bool IsInsert(void)  const { return m_Operation == eInsert; }
    bool IsDelete(void)  const { return m_Operation == eDelete; }
    bool IsEqual (void)  const { return m_Operation == eEqual;  }

    /// Get string for current operation
    CTempString GetString(void) const { return m_String; }

    /// Get string length.
    ///
    /// For character-based diffs it is the same as GetString().length().
    /// But in the line-based diffs the string can be truncated
    /// and it may not have ending EOL. So, the original string can be
    /// reconstructed using:
    ///   CTempString(op.GetString().data(), op.GetLength())
    /// 
    size_type GetLength(void) const { return m_Length; }

    /// Get offset of the current difference in the original text.
    ///
    /// Return structure with offsets in the first (original)
    /// and in the second (result) strings.
    ///
    /// For performance reasons, Diff() does not operate with offsets of
    /// the substrings and it does not store them. Original strings
    /// can be easily reconstructed from the list of diffs, and offsets
    /// also can be calculated afterwards.
    /// See CDiffList::CalculateOffsets().
    /// @return
    ///    - Zero-based offset of the current difference in the original
    ///      text. The offset is counting from the start of the text for
    ////     both block- and line-based modes.
    ///    - NPOS if offset is not calculated or unknown.
    ///    - GetOffset().first represents offsets for DELETE and EQUAL
    ///      operation only, GetOffset().second for INSERT and EQUAL only.
    /// @sa
    ///   GetOperation(), GetString(), GetLine(),
    ///   CDiff::Diff(), CDiffList::CalculateOffsets()
    SPos GetOffset(void) const;

    /// Get line number of the current difference (line-mode diff only).
    /// Line numbers are 1-based.
    ///
    /// @return 
    ///    - 1-based line number of the current difference in line-mode
    ///      only. Always return NPOS in block-mode.
    ///    - GetLine().first returns line number for DELETE and EQUAL
    ///      operations only, GetLine().second for INSERT and EQUAL only.
    ///    - NPOS otherwise.
    /// @sa
    ///   GetOperation(), GetString(), GetOffset(), CDiff::DiffLines()
    SPos GetLine(void) const;

    /// Compare substrings and operation types only, all other attributes
    /// (offset, row, col and etc) are not used in comparison.
    bool operator== (const CDiffOperation& diff_op) const;
    bool operator!= (const CDiffOperation& diff_op) const;

protected:
    // Setters
    void SetOperation(EType operation);
    void SetString(CTempString str, size_type len = NPOS);
    void SetLength(size_type length);
    void SetOffset(SPos offset);
    void SetLine(SPos line);

    friend class CDiffList;
    friend class CDiff;
    friend class CDiffText;

private:
    EType        m_Operation; ///< Type of the current diff operation
    CTempString  m_String;    ///< The string associated with this diff operation
    size_type    m_Length;    ///< The real length of the m_String
                              ///< (including EOL, if it was truncated in line-mode diff)
    SPos m_Offset;  ///< The offsets of the current string in the original string (or NPOS)
    SPos m_Line;    ///< Line numbers of the current diff (or NPOS) -- line-mode only
};




/////////////////////////////////////////////////////////////////////////////
///
/// CDiffList -- the list of diff operations.
///

class NCBI_XUTIL_EXPORT CDiffList
{
public:
    /// Size type definition
    typedef CDiffOperation::size_type size_type;
    /// Storage class type for the list of diff operations
    typedef list<CDiffOperation> TList;

    /// Constructor.
    CDiffList(void) {}

    /// Compute the edit distance (Levenshtein distance).
    ///
    /// Can be used to check how much strings differ.
    /// @return
    ///   Return number of single-character edits in the current difference
    ///   list, including the number of inserted, deleted or substituted
    ///   characters, required to be done to convert one string to another.
    ///   Return zero if diffed strings are equal.
    size_type GetEditDistance(void) const;

    /// Find the longest common substring for current difference list.
    ///
    /// @return
    ///    The algorithm just returns the first longest common substring
    ///    it finds, which therefore may not be unique.
    ///    Return empty string if compared strings where fully different 
    ///    and no common substrings where found.
    /// @note
    ///   If current difference list is not optimal, the result can
    ///   be shorter than it should be. Use CleanupAndMerge() first
    ///   to get more optimized diff.
    /// @note
    ///   For not optimized line-based diffs it return a first longest
    ///   common line.
    CTempString GetLongestCommonSubstring(void) const;

    /// Reorder and merge like edit sections, merge equalities.
    ///
    /// This method can make resulting list cleaner and shorter,
    /// but this can take some time.
    void CleanupAndMerge(void);

    /// Calculate offsets for all substrings in the difference list
    /// and find its position from the start of the original strings,
    /// used in diff. 
    ///
    /// For performance reasons, Diff() do not operate with offsets of
    /// the substrings and do not store them, but as original strings
    /// can be easy reconstructed from the list of diffs, that offsets
    /// also can be calculated afterwards. 
    ///
    /// For DELETE or EQUAl type operations the calculated value is
    /// an offset of the substring in the first (original) string
    /// used in diff. For INSERT type operation the calculated value
    /// is an offset in the second (result) string used in diff.
    ///
    /// @note
    ///   This method cannot calculate rows and columns for regular
    ///   diffs, use line-mode diffs that do this automatically.
    /// @sa
    ///   CDiffOperation::GetOffset()
    void CalculateOffsets(void);

public:
    /// Get list of the diff operations as list<>.
    const TList& GetList(void) const;
    /// Remove all elements from the list
    void Clear(void);

    /// Add element to the front of the list.
    void Prepend(CDiffOperation::EType operation, CTempString str);
    /// Add element to the front of the list
    void Prepend(const CDiffOperation& op);
    /// Add element to the end of the list.
    void Append(CDiffOperation::EType operation, CTempString str);
    /// Add element to the end of the list.
    void Append(const CDiffOperation& op);

private:
    /// Merge adjacent parts with the same operation.
    /// Any edit section can move as long as it doesn't cross an equality.
    void x_CleanupAndMerge_Equities(void);

    /// Look for single edits surrounded on both sides by equalities which
    /// can be shifted sideways to eliminate an equality.
    /// e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
    /// Return TRUE if shifts were made and the diff needs reordering
    /// and another shift sweep
    bool x_CleanupAndMerge_SingleEdits(void);

private:
    TList m_List;  ///< List of the differences
    friend class CDiff;
    friend class CDiffText;
};


/////////////////////////////////////////////////////////////////////////////
///
/// CDiffBase -- Diff and compare texts (base class)
///
/// Do not use this class directly, use inherited classes CDiff or CDiffText.

class NCBI_XUTIL_EXPORT CDiffBase
{
public:
    /// Type definition.
    typedef CDiffOperation::size_type size_type;

    /// Set timeout.
    ///
    /// Despite the large number of optimizations used in Diff(), it can take
    /// a while to compute. You can limit its execution time using this method.
    /// The default value for timeout if not specified is infinity, that lets
    /// diff run until completion. Should diff timeout, the return value will
    /// still be a valid difference, though probably non-optimal. 
    void SetTimeout(const CTimeout& tmo) { m_Timeout = tmo; }

    /// Check if timeout is expired.
    bool IsTimeoutExpired() const;

private:
    /// Constructor
    CDiffBase(void)
        : m_Timeout(CTimeout::eInfinite), m_Deadline(NULL)
    { }

protected:
    /// Reset internal state and prepare to next Diff()
    void Reset(void);

protected:
    CDiffList   m_Diffs;     ///< The list of differences from the last diff
    CTimeout    m_Timeout;   ///< Relative timeout for processing
    CDeadline*  m_Deadline;  ///< Deadline for processing (NULL if not set)

    friend class CDiff;
    friend class CDiffText;
};


/////////////////////////////////////////////////////////////////////////////
///
/// CDiff -- Diff and compare strings (character mode)
///
/// Throw exception on error.

class NCBI_XUTIL_EXPORT CDiff : public CDiffBase
{
public:
    /// Processing flags.
    enum EFlags {
        /// By default Diff() automatically call CleanupAndMerge() for the
        /// generated list of differences to have a shorter and cleaner list
        /// of results. It is on by default for regular character-based diff.
        /// But if you need faster (and less optimal) result, you can skip it.
        fNoCleanup        = 1 << 0,
        /// Automatically call CalculateOffests() for the generated list
        /// of differences.
        fCalculateOffsets = 1 << 1
    };
    typedef unsigned int TFlags;  ///< Bitwise OR of "EFlags"

    /// Find the differences between two texts (character mode).
    ///
    /// @param s1 
    ///   Old string to be diffed.
    /// @param s2
    ///   New string to be diffed.
    /// @flags
    ///   Processing flags.
    /// @return
    ///   The list of operations that should be done to convert string 's1' to 's2'.
    /// @sa SetTimeout(), CDiffText
    CDiffList& Diff(CTempString s1, CTempString s2, TFlags flags = 0);

private:
    /// Five element array for the list of strings, returned by x_DiffHalfMatch()
    typedef vector<CTempString> TDiffHalfMatchList;

    // Auxiliary methods for Diff()
    bool x_DiffHalfMatch  (CTempString s1, CTempString s2,
                           TDiffHalfMatchList& hm) const;
    bool x_DiffHalfMatchI (CTempString long_str, CTempString short_str,
                           size_type i, TDiffHalfMatchList& hm) const;
    void x_DiffBisect     (CTempString s1, CTempString s2,
                           CDiffList& diffs) const;
    void x_DiffBisectSplit(CTempString s1, CTempString s2, int x, int y,
                           CDiffList& diffs) const;
    void x_Diff(CTempString s1, CTempString s2, CDiffList& diffs) const;
};


/////////////////////////////////////////////////////////////////////////////
///
/// CDiffText -- Diff and compare texts (line mode)
///
/// Throw exception on error.

class NCBI_XUTIL_EXPORT CDiffText : public CDiffBase
{
public:
    /// Processing flags.
    enum EFlags {
        /// Automatically call CleanupAndMerge() for the generated list
        /// of differences after Diff() to have a shorter and cleaner
        /// list of results. It is off by default for line-based diffs.
        fCleanup          = 1 << 0,
        /// Automatically call CalculateOffests() for the generated list
        /// of differences.
        fCalculateOffsets = 1 << 1,
        /// Ignore differences in end-of-line types on string comparison.
        fIgnoreEOL        = 1 << 2,
        /// Remove end-of-line symbols from each string added to the list,
        /// which can be obtained using CDiffOperation::GetString().
        /// This flag is incompatible with fCleanup, exception will be
        /// thrown if they where used together. The fRemoveEOL affect
        /// a visual appearance of the string, and do not have an influence
        /// on how lines with different kind of EOLs compares, 
        /// use fIgnoreEOL in addition if necessary.
        fRemoveEOL        = 1 << 3
    };
    typedef unsigned int TFlags;   ///< Bitwise OR of "Flags"

    /// Find the differences between two texts (line mode).
    ///
    /// Run a line-based diff, that operates with whole lines, and not trying
    /// to find a differences inside added/deleted blocks of lines.
    /// @param text1 
    ///   Old text to be diffed.
    /// @param text2
    ///   New text to be diffed.
    /// @flags
    ///   Processing flags.
    ///   Default: non-optimal line-based diff that have all EOLs.
    /// @return
    ///   The list of operations that should be done to convert text 's1' to 's2'.
    /// @sa SetTimeout(), CDiff
    CDiffList& Diff(CTempString text1, CTempString text2, TFlags flags = 0);

    /// Find the differences between two texts and print result
    /// into output stream in unified-like format.
    ///
    /// @param out
    ///   The output stream.
    /// @param text1 
    ///   Old text to be diffed.
    /// @param text2
    ///   New text to be diffed.
    /// @param num_common_lines
    ///   The number of unchanged lines shown above and below a change hunk.
    ///   Three lines is typically the default. Use zero, if you need to see
    ///   a changes only, without any surrounding text.
    /// @sa
    ///   Diff()
    /// @note
    ///   Using CTempStrings internally is faster and use less memory, but do not
    ///   allow to perform some optimizations and cleanup, so result can looks
    ///   different that 'diff' utility provide, and have more hunks.
    ///   Also, the printed result have next differences from canonical unified
    ///   format:
    ///   1) Unified format allow crossed hunks, when end of the first and start
    ///      of the second hunk have the same unchanged lines of text. This version
    ///      don't have crossed hunks. If some hunks crosses, the second will be
    ///      started with less number of unchanged lines that specified by
    ///      'num_common_lines' parameter.
    ///   2) Unified format can have zero-based line numbers. like:
    ///          @@ -0,0 +1,1 @@
    ///          +Added line.
    ///      This version always use one-based line numbers:
    ///          @@ -1,0 +1,1 @@
    ///          +Added line.
    /// @note
    ///   It is not fully compatible with result provided by 'diff -u' (or -U NNN),
    ///   and sometimes it is impossible to use 'patch' utility with provided diff.
    CNcbiOstream& PrintUnifiedDiff(CNcbiOstream& out,
                                   CTempString text1, CTempString text2,
                                   unsigned int num_common_lines = 3);
};


/////////////////////////////////////////////////////////////////////////////
///
/// CDiffException --
///
/// Define exceptions generated by CDiff.

class NCBI_XUTIL_EXPORT CDiffException : public CException
{
public:
    enum EErrCode {
        eEmpty,
        eBadFlags
    };
    virtual const char* GetErrCodeString(void) const override;
    NCBI_EXCEPTION_DEFAULT(CDiffException, CException);
};


/* @} */


//////////////////////////////////////////////////////////////////////////////
//
// Inline
//

//
// CDiffOperation
//

inline
bool CDiffOperation::operator== (const CDiffOperation& diff_op) const
{
    return (diff_op.m_Operation == this->m_Operation) &&
            (diff_op.m_String == this->m_String);
}

inline
bool CDiffOperation::operator!= (const CDiffOperation& diff_op) const
{
    return !(operator == (diff_op));
}

inline
CDiffOperation::SPos CDiffOperation::GetOffset(void) const
{
    return m_Offset;
}

inline
CDiffOperation::SPos CDiffOperation::GetLine(void) const
{
    return m_Line;
}

inline
void CDiffOperation::SetOperation(EType operation)
{
    m_Operation = operation;
}

inline
void CDiffOperation::SetLength(size_type length)
{
    m_Length = length;
}

inline
void CDiffOperation::SetString(CTempString str, size_type len)
{
    m_String = str;
    m_Length = (len == NPOS) ? str.length() : len;
}

inline
void CDiffOperation::SetOffset(SPos offset)
{
    m_Offset = offset;
}

inline
void CDiffOperation::SetLine(SPos line)
{
    m_Line = line;
}


//
// CDiffList
//

inline
const CDiffList::TList& CDiffList::GetList(void) const 
{
    return m_List;
}

inline
void CDiffList::Prepend(CDiffOperation::EType operation, CTempString str)
{
    if (str.length()) {
        m_List.push_front(CDiffOperation(operation, str));
    }
}

inline
void CDiffList::Prepend(const CDiffOperation& op)
{
    m_List.push_front(op);
}

inline
void CDiffList::Append(CDiffOperation::EType operation, CTempString str)
{
    if (str.length()) {
        m_List.push_back(CDiffOperation(operation, str));
    }
}

inline
void CDiffList::Append(const CDiffOperation& op)
{
    m_List.push_back(op);
}

inline
void CDiffList::Clear(void) 
{
    m_List.clear();
}


//
// CDiffBase
//

inline
void CDiffBase::Reset(void)
{
    m_Diffs.Clear();
    m_Deadline = NULL;
}

inline
bool CDiffBase::IsTimeoutExpired() const
{
    if (!m_Deadline) {
        return false;
    }
    return m_Deadline->IsExpired();
}


END_NCBI_SCOPE

#endif  /* UTIL___DIFF__HPP */
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200
0201
0202
0203
0204
0205
0206
0207
0208
0209
0210
0211
0212
0213
0214
0215
0216
0217
0218
0219
0220
0221
0222
0223
0224
0225
0226
0227
0228
0229
0230
0231
0232
0233
0234
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246
0247
0248
0249
0250
0251
0252
0253
0254
0255
0256
0257
0258
0259
0260
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
0275
0276
0277
0278
0279
0280
0281
0282
0283
0284
0285
0286
0287
0288
0289
0290
0291
0292
0293
0294
0295
0296
0297
0298
0299
0300
0301
0302
0303
0304
0305
0306
0307
0308
0309
0310
0311
0312
0313
0314
0315
0316
0317
0318
0319
0320
0321
0322
0323
0324
0325
0326
0327
0328
0329
0330
0331
0332
0333
0334
0335
0336
0337
0338
0339
0340
0341
0342
0343
0344
0345
0346
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357
0358
0359
0360
0361
0362
0363
0364
0365
0366
0367
0368
0369
0370
0371
0372
0373
0374
0375
0376
0377
0378
0379
0380
0381
0382
0383
0384
0385
0386
0387
0388
0389
0390
0391
0392
0393
0394
0395
0396
0397
0398
0399
0400
0401
0402
0403
0404
0405
0406
0407
0408
0409
0410
0411
0412
0413
0414
0415
0416
0417
0418
0419
0420
0421
0422
0423
0424
0425
0426
0427
0428
0429
0430
0431
0432
0433
0434
0435
0436
0437
0438
0439
0440
0441
0442
0443
0444
0445
0446
0447
0448
0449
0450
0451
0452
0453
0454
0455
0456
0457
0458
0459
0460
0461
0462
0463
0464
0465
0466
0467
0468
0469
0470
0471
0472
0473
0474
0475
0476
0477
0478
0479
0480
0481
0482
0483
0484
0485
0486
0487
0488
0489
0490
0491
0492
0493
0494
0495
0496
0497
0498
0499
0500
0501
0502
0503
0504
0505
0506
0507
0508
0509
0510
0511
0512
0513
0514
0515
0516
0517
0518
0519
0520
0521
0522
0523
0524
0525
0526
0527
0528
0529
0530
0531
0532
0533
0534
0535
0536
0537
0538
0539
0540
0541
0542
0543
0544
0545
0546
0547
0548
0549
0550
0551
0552
0553
0554
0555
0556
0557
0558
0559
0560
0561
0562
0563
0564
0565
0566
0567
0568
0569
0570
0571
0572
0573
0574
0575
0576
0577
0578
0579
0580
0581
0582
0583
0584
0585
0586
0587
0588
0589
0590
0591
0592
0593
0594
0595
0596
0597
0598
0599
0600
0601
0602
0603
0604
0605
0606
0607
0608
0609
0610
0611
0612
0613
0614
0615
0616
0617
0618
0619
0620
0621