BASIS  r3148
diff3.py
Go to the documentation of this file.
00001 """
00002 
00003     @file  diff3.py
00004     @brief Three-way file comparison algorithm.
00005 
00006     This is a line-by-line translation of the Text::Diff3 Perl module version
00007     0.10 written by MIZUTANI Tociyuki <tociyuki@gmail.com>.
00008 
00009     The Text::Diff3 Perl module in turn was based on the diff3 program:
00010 
00011     Three way file comparison program (diff3) for Project GNU.
00012     Copyright (C) 1988, 1989, 1992, 1993, 1994 Free Software Foundation, Inc.
00013     Written by Randy Smith
00014 
00015     The two-way file comparison procedure was based on the article:
00016     P. Heckel. ``A technique for isolating differences between files.''
00017     Communications of the ACM, Vol. 21, No. 4, page 264, April 1978.
00018 
00019     Copyright (c) 2012 University of Pennsylvania. All rights reserved.
00020     See https://www.cbica.upenn.edu/sbia/software/license.html or COPYING file.
00021 
00022 """
00023 
00024 from operator import xor
00025 
00026 
00027 # ----------------------------------------------------------------------------
00028 def diff3(yourtext, origtext, theirtext):
00029     """Three-way diff based on the GNU diff3.c by R. Smith.
00030 
00031     @param [in] yourtext   Array of lines of your text.
00032     @param [in] origtext   Array of lines of original text.
00033     @param [in] theirtext   Array of lines of their text.
00034 
00035     @returns Array of tuples containing diff results. The tuples consist of
00036              (cmd, loA, hiA, loB, hiB), where cmd is either one of
00037              '0', '1', '2', or 'A'.
00038 
00039     """
00040     # diff result => [(cmd, loA, hiA, loB, hiB), ...]
00041     d2 = (diff(origtext, yourtext), diff(origtext, theirtext))
00042     d3 = []
00043     r3 = [None,  0, 0,  0, 0,  0, 0]
00044     while d2[0] or d2[1]:
00045         # find a continual range in origtext lo2..hi2
00046         # changed by yourtext or by theirtext.
00047         #
00048         #     d2[0]        222    222222222
00049         #  origtext     ...L!!!!!!!!!!!!!!!!!!!!H...
00050         #     d2[1]          222222   22  2222222
00051         r2 = ([], [])
00052         if not d2[0]: i = 1
00053         else:
00054             if not d2[1]: i = 0
00055             else: 
00056                 if d2[0][0][1] <= d2[1][0][1]: i = 0
00057                 else: i = 1
00058         j  = i
00059         k  = xor(i, 1)
00060         hi = d2[j][0][2]
00061         r2[j].append(d2[j].pop(0))
00062         while d2[k] and d2[k][0][1] <= hi + 1:
00063             hi_k = d2[k][0][2]
00064             r2[k].append(d2[k].pop(0))
00065             if hi < hi_k:
00066                 hi = hi_k
00067                 j  = k
00068                 k  = xor(k, 1)
00069         lo2 = r2[i][ 0][1]
00070         hi2 = r2[j][-1][2]
00071         # take the corresponding ranges in yourtext lo0..hi0
00072         # and in theirtext lo1..hi1.
00073         #
00074         #   yourtext     ..L!!!!!!!!!!!!!!!!!!!!!!!!!!!!H...
00075         #      d2[0]        222    222222222
00076         #   origtext     ...00!1111!000!!00!111111...
00077         #      d2[1]          222222   22  2222222
00078         #  theirtext          ...L!!!!!!!!!!!!!!!!H...
00079         if r2[0]:
00080             lo0 = r2[0][ 0][3] - r2[0][ 0][1] + lo2
00081             hi0 = r2[0][-1][4] - r2[0][-1][2] + hi2
00082         else:
00083             lo0 = r3[2] - r3[6] + lo2
00084             hi0 = r3[2] - r3[6] + hi2
00085         if r2[1]:
00086             lo1 = r2[1][ 0][3] - r2[1][ 0][1] + lo2
00087             hi1 = r2[1][-1][4] - r2[1][-1][2] + hi2
00088         else:
00089             lo1 = r3[4] - r3[6] + lo2
00090             hi1 = r3[4] - r3[6] + hi2
00091         # detect type of changes
00092         if not r2[0]:
00093             cmd = '1'
00094         elif not r2[1]:
00095             cmd = '0'
00096         elif hi0 - lo0 != hi1 - lo1:
00097             cmd = 'A'
00098         else:
00099             cmd = '2'
00100             for d in range(0, hi0 - lo0 + 1):
00101                 (i0, i1) = (lo0 + d - 1, lo1 + d - 1)
00102                 ok0 = (0 <= i0 and i0 < len(yourtext))
00103                 ok1 = (0 <= i1 and i1 < len(theirtext))
00104                 if xor(ok0, ok1) or (ok0 and yourtext[i0] != theirtext[i1]):
00105                     cmd = 'A'
00106                     break
00107         d3.append((cmd,  lo0, hi0,  lo1, hi1,  lo2, hi2))
00108     return d3
00109 
00110 # ----------------------------------------------------------------------------
00111 def merge(yourtext, origtext, theirtext):
00112     res = {'conflict': 0, 'body': []}
00113     d3 = diff3(yourtext, origtext, theirtext)
00114     text3 = (yourtext, theirtext, origtext)
00115     i2 = 1
00116     for r3 in d3:
00117         for lineno in range(i2, r3[5]):
00118             res['body'].append(text3[2][lineno - 1])
00119         if r3[0] == '0':
00120             for lineno in range(r3[1], r3[2] + 1):
00121                 res['body'].append(text3[0][lineno - 1])
00122         elif r3[0] != 'A':
00123             for lineno in range(r3[3], r3[4] + 1):
00124                 res['body'].append(text3[1][lineno - 1])
00125         else:
00126             res = _conflict_range(text3, r3, res)
00127         i2 = r3[6] + 1
00128     for lineno in range(i2, len(text3[2]) + 1):
00129         res['body'].append(text3[2][lineno - 1])
00130     return res
00131 
00132 # ----------------------------------------------------------------------------
00133 def _conflict_range(text3, r3, res):
00134     text_a = [] # their text
00135     for i in range(r3[3], r3[4] + 1):
00136         text_a.append(text3[1][i - 1])
00137     text_b = [] # your text
00138     for i in range(r3[1], r3[2] + 1):
00139         text_b.append(text3[0][i - 1])
00140     d = diff(text_a, text_b)
00141     if _assoc_range(d, 'c') and r3[5] <= r3[6]:
00142         res['conflict'] += 1
00143         res['body'].append('<<<<<<<')
00144         for lineno in range(r3[1], r3[2] + 1):
00145             res['body'].append(text3[0][lineno - 1])
00146         res['body'].append('|||||||')
00147         for lineno in range(r3[5], r3[6] + 1):
00148             res['body'].append(text3[2][lineno - 1])
00149         res['body'].append('=======')
00150         for lineno in range(r3[3], r3[4] + 1):
00151             res['body'].append(text3[1][lineno - 1])
00152         res['body'].append('>>>>>>>')
00153         return res
00154     ia = 1
00155     for r2 in d:
00156         for lineno in range(ia, r2[1]):
00157             res['body'].append(text_a[lineno - 1])
00158         if r2[0] == 'c':
00159             res['conflict'] += 1
00160             res['body'].append('<<<<<<<')
00161             for lineno in range(r2[3], r2[4] + 1):
00162                 res['body'].append(text_b[lineno - 1])
00163             res['body'].append('=======')
00164             for lineno in range(r2[1], r2[2] + 1):
00165                 res['body'].append(text_a[lineno - 1])
00166             res['body'].append('>>>>>>>')
00167         elif r2[0] == 'a':
00168             for lineno in range(r2[3], r2[4] + 1):
00169                 res['body'].append(text_b[lineno - 1])
00170         ia = r2[2] + 1
00171     for lineno in range(ia, len(text_a)):
00172         res['body'].append(text_a[lineno - 1])
00173     return res
00174 
00175 # ----------------------------------------------------------------------------
00176 def _assoc_range(diff, diff_type):
00177     for d in diff:
00178         if d[0] == diff_type: return d
00179     return None
00180 
00181 # ----------------------------------------------------------------------------
00182 def _diff_heckel(text_a, text_b):
00183     """Two-way diff based on the algorithm by P. Heckel.
00184 
00185     @param [in] text_a Array of lines of first text.
00186     @param [in] text_b Array of lines of second text.
00187 
00188     @returns TODO
00189 
00190     """
00191     d    = [];
00192     uniq = [(len(text_a), len(text_b))]
00193     (freq, ap, bp) = ({}, {}, {})
00194     for i in range(len(text_a)):
00195         s = text_a[i]
00196         freq[s] = freq.get(s, 0) + 2;
00197         ap  [s] = i;
00198     for i in range(len(text_b)):
00199         s = text_b[i]
00200         freq[s] = freq.get(s, 0) + 3;
00201         bp  [s] = i;
00202     for s, x in freq.items():
00203         if x == 5: uniq.append((ap[s], bp[s]))
00204     (freq, ap, bp) = ({}, {}, {})
00205     uniq.sort(key=lambda x: x[0])
00206     (a1, b1) = (0, 0)
00207     while a1 < len(text_a) and b1 < len(text_b):
00208         if text_a[a1] != text_b[b1]: break
00209         a1 += 1
00210         b1 += 1
00211     for a_uniq, b_uniq in uniq:
00212         if a_uniq < a1 or b_uniq < b1: continue
00213         (a0, b0) = (a1, b1)
00214         (a1, b1) = (a_uniq - 1, b_uniq - 1)
00215         while a0 <= a1 and b0 <= b1:
00216             if text_a[a1] != text_b[b1]: break
00217             a1 -= 1
00218             b1 -= 1
00219         if a0 <= a1 and b0 <= b1:
00220             d.append(('c', a0 + 1, a1 + 1, b0 + 1, b1 + 1))
00221         elif a0 <= a1:
00222             d.append(('d', a0 + 1, a1 + 1, b0 + 1, b0))
00223         elif b0 <= b1:
00224             d.append(('a', a0 + 1, a0, b0 + 1, b1 + 1))
00225         (a1, b1) = (a_uniq + 1, b_uniq + 1)
00226         while a1 < len(text_a) and b1 < len(text_b):
00227             if text_a[a1] != text_b[b1]: break
00228             a1 += 1
00229             b1 += 1
00230     return d
00231 
00232 # ----------------------------------------------------------------------------
00233 diff = _diff_heckel # default two-way diff function used by diff3()