mercurial/pure/bdiff.py
author Arseniy Alekseyev <aalekseyev@janestreet.com>
Fri, 26 Apr 2024 19:10:35 +0100
changeset 51626 865efc020c33
parent 49598 594fc56c0af7
permissions -rw-r--r--
dirstate: remove the python-side whitelist of allowed matchers This whitelist is too permissive because it allows matchers that contain disallowed ones deep inside, for example through `intersectionmatcher`. It is also too restrictive because it doesn't pass through some of the matchers we support, such as `patternmatcher`. It's also unnecessary because unsupported matchers raise `FallbackError` and we fall back anyway. Making this change makes more of the tests use rust code path, and therefore subtly change behavior. For example, rust status in largefiles repos seems to have strange behavior.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
     1
# bdiff.py - Python implementation of bdiff.c
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
     2
#
46819
d4ba4d51f85f contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents: 45863
diff changeset
     3
# Copyright 2009 Olivia Mackall <olivia@selenic.com> and others
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
     4
#
8225
46293a0c7e9f updated license to be explicit about GPL version 2
Martin Geisler <mg@lazybytes.net>
parents: 7944
diff changeset
     5
# This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 8225
diff changeset
     6
# GNU General Public License version 2 or any later version.
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
     7
27335
c4e3ff497f89 bdiff: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 15530
diff changeset
     8
c4e3ff497f89 bdiff: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 15530
diff changeset
     9
import difflib
c4e3ff497f89 bdiff: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 15530
diff changeset
    10
import re
c4e3ff497f89 bdiff: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 15530
diff changeset
    11
import struct
7944
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    12
49598
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    13
from typing import (
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    14
    List,
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    15
    Tuple,
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    16
)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    17
49598
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    18
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    19
def splitnewlines(text: bytes) -> List[bytes]:
7944
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    20
    '''like str.splitlines, but only split on newlines.'''
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    21
    lines = [l + b'\n' for l in text.split(b'\n')]
7944
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    22
    if lines:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    23
        if lines[-1] == b'\n':
7944
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    24
            lines.pop()
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    25
        else:
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    26
            lines[-1] = lines[-1][:-1]
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    27
    return lines
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    28
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    29
49598
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    30
def _normalizeblocks(
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    31
    a: List[bytes], b: List[bytes], blocks
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    32
) -> List[Tuple[int, int, int]]:
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    33
    prev = None
14066
14fac6c0536a pure bdiff: don't use a generator
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 10282
diff changeset
    34
    r = []
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    35
    for curr in blocks:
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    36
        if prev is None:
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    37
            prev = curr
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    38
            continue
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    39
        shift = 0
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    40
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    41
        a1, b1, l1 = prev
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    42
        a1end = a1 + l1
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    43
        b1end = b1 + l1
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    44
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    45
        a2, b2, l2 = curr
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    46
        a2end = a2 + l2
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    47
        b2end = b2 + l2
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    48
        if a1end == a2:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    49
            while (
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    50
                a1end + shift < a2end and a[a1end + shift] == b[b1end + shift]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    51
            ):
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    52
                shift += 1
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    53
        elif b1end == b2:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    54
            while (
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    55
                b1end + shift < b2end and a[a1end + shift] == b[b1end + shift]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    56
            ):
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    57
                shift += 1
14066
14fac6c0536a pure bdiff: don't use a generator
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 10282
diff changeset
    58
        r.append((a1, b1, l1 + shift))
10282
08a0f04b56bd many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents: 10263
diff changeset
    59
        prev = a2 + shift, b2 + shift, l2 - shift
45863
68aedad4c11c pure: guard against empty blocks
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
    60
68aedad4c11c pure: guard against empty blocks
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
    61
    if prev is not None:
68aedad4c11c pure: guard against empty blocks
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
    62
        r.append(prev)
68aedad4c11c pure: guard against empty blocks
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
    63
14066
14fac6c0536a pure bdiff: don't use a generator
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 10282
diff changeset
    64
    return r
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    65
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    66
49598
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    67
def bdiff(a: bytes, b: bytes) -> bytes:
31641
f2b334e6c7e0 py3: use bytes() to cast to immutable bytes in pure.bdiff.bdiff()
Yuya Nishihara <yuya@tcha.org>
parents: 31640
diff changeset
    68
    a = bytes(a).splitlines(True)
f2b334e6c7e0 py3: use bytes() to cast to immutable bytes in pure.bdiff.bdiff()
Yuya Nishihara <yuya@tcha.org>
parents: 31640
diff changeset
    69
    b = bytes(b).splitlines(True)
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    70
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    71
    if not a:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    72
        s = b"".join(b)
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    73
        return s and (struct.pack(b">lll", 0, 0, len(s)) + s)
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    74
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    75
    bin = []
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    76
    p = [0]
34435
5326e4ef1dab style: never put multiple statements on one line
Alex Gaynor <agaynor@mozilla.com>
parents: 32512
diff changeset
    77
    for i in a:
5326e4ef1dab style: never put multiple statements on one line
Alex Gaynor <agaynor@mozilla.com>
parents: 32512
diff changeset
    78
        p.append(p[-1] + len(i))
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    79
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    80
    d = difflib.SequenceMatcher(None, a, b).get_matching_blocks()
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    81
    d = _normalizeblocks(a, b, d)
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    82
    la = 0
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    83
    lb = 0
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    84
    for am, bm, size in d:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    85
        s = b"".join(b[lb:bm])
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    86
        if am > la or s:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    87
            bin.append(struct.pack(b">lll", p[la], p[am], len(s)) + s)
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    88
        la = am + size
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    89
        lb = bm + size
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    90
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    91
    return b"".join(bin)
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    92
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
    93
49598
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
    94
def blocks(a: bytes, b: bytes) -> List[Tuple[int, int, int, int]]:
7944
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    95
    an = splitnewlines(a)
e9b48afd0e78 pure/bdiff: fix circular import
Matt Mackall <mpm@selenic.com>
parents: 7703
diff changeset
    96
    bn = splitnewlines(b)
7703
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    97
    d = difflib.SequenceMatcher(None, an, bn).get_matching_blocks()
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    98
    d = _normalizeblocks(an, bn, d)
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
    99
    return [(i, i + n, j, j + n) for (i, j, n) in d]
9044d3567f6d pure Python implementation of bdiff.c
Martin Geisler <mg@daimi.au.dk>
parents:
diff changeset
   100
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41269
diff changeset
   101
49598
594fc56c0af7 typing: add type hints to bdiff implementations
Matt Harbison <matt_harbison@yahoo.com>
parents: 48875
diff changeset
   102
def fixws(text: bytes, allws: bool) -> bytes:
15530
eeac5e179243 mdiff: replace wscleanup() regexps with C loops
Patrick Mezard <pmezard@gmail.com>
parents: 14066
diff changeset
   103
    if allws:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   104
        text = re.sub(b'[ \t\r]+', b'', text)
15530
eeac5e179243 mdiff: replace wscleanup() regexps with C loops
Patrick Mezard <pmezard@gmail.com>
parents: 14066
diff changeset
   105
    else:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   106
        text = re.sub(b'[ \t\r]+', b' ', text)
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   107
        text = text.replace(b' \n', b'\n')
15530
eeac5e179243 mdiff: replace wscleanup() regexps with C loops
Patrick Mezard <pmezard@gmail.com>
parents: 14066
diff changeset
   108
    return text